diff --git a/.clang-tidy b/.clang-tidy index 7127535c8e66a377978897492b26cb954201dc20..f0e5933ad55dc18c06f14f2c6ef06dc3226eda22 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -8,6 +8,7 @@ boost-*, bugprone-*, -bugprone-branch-clone, -bugprone-exception-escape, +-bugprone-easily-swappable-parameters, misc-*, -misc-misplaced-const, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77f6a476bd055fdb6db221ffed5571d80627d119..16bca0657013d6a0421540cf3ac1d3b825b382be 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -54,6 +54,7 @@ stages: -DWALBERLA_ENABLE_GUI=$WALBERLA_ENABLE_GUI -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN -DWALBERLA_STL_BOUNDS_CHECKS=$WALBERLA_STL_BOUNDS_CHECKS + -DWALBERLA_LOGLEVEL=$WALBERLA_LOGLEVEL - cmake . -LA - make -j $NUM_BUILD_CORES -l $NUM_CORES - ctest -LE $CTEST_EXCLUDE_LABELS -C $CMAKE_BUILD_TYPE --output-on-failure -j $NUM_CORES -T Test @@ -75,6 +76,7 @@ stages: WALBERLA_BUILD_WITH_PARMETIS: "ON" WALBERLA_BUILD_WITH_FFTW: "ON" WALBERLA_ENABLE_GUI: "OFF" + WALBERLA_LOGLEVEL: "DETAIL" artifacts: when: always reports: @@ -413,7 +415,7 @@ gcc_8_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -437,7 +439,7 @@ gcc_8_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -459,7 +461,7 @@ gcc_8_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla 
lbmpy_walberla - pip3 list @@ -480,7 +482,7 @@ gcc_8_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -505,7 +507,7 @@ gcc_8_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -528,7 +530,7 @@ gcc_8_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -550,7 +552,7 @@ gcc_8_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:8 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -575,7 +577,7 @@ gcc_9_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -599,7 +601,7 @@ gcc_9_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -621,7 +623,7 @@ gcc_9_hybrid: extends: .build_template image: 
i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -642,7 +644,7 @@ gcc_9_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -667,7 +669,7 @@ gcc_9_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -690,7 +692,7 @@ gcc_9_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -712,7 +714,7 @@ gcc_9_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -737,7 +739,7 @@ gcc_10_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -761,7 +763,7 @@ gcc_10_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 
pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -783,7 +785,7 @@ gcc_10_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -804,7 +806,7 @@ gcc_10_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -829,7 +831,7 @@ gcc_10_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -852,7 +854,7 @@ gcc_10_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -874,7 +876,7 @@ gcc_10_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:10 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -899,7 +901,7 @@ gcc_11_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest 
--junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -923,7 +925,7 @@ gcc_11_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -945,7 +947,7 @@ gcc_11_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -966,7 +968,7 @@ gcc_11_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -991,7 +993,7 @@ gcc_11_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1014,7 +1016,7 @@ gcc_11_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1036,7 +1038,7 @@ gcc_11_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:11 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1341,7 
+1343,7 @@ clang_11.0_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1365,7 +1367,7 @@ clang_11.0_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1387,7 +1389,7 @@ clang_11.0_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1408,7 +1410,7 @@ clang_11.0_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1433,7 +1435,7 @@ clang_11.0_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1456,7 +1458,7 @@ clang_11.0_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1478,7 +1480,7 @@ 
clang_11.0_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:11.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1503,7 +1505,7 @@ clang_12.0_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1527,7 +1529,7 @@ clang_12.0_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1549,7 +1551,7 @@ clang_12.0_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1570,7 +1572,7 @@ clang_12.0_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1595,7 +1597,7 @@ clang_12.0_mpionly_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1618,7 +1620,7 @@ clang_12.0_hybrid_dbg: extends: 
.build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1640,7 +1642,7 @@ clang_12.0_hybrid_dbg_sp: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:12.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1665,7 +1667,7 @@ clang_13.0_serial: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1689,7 +1691,7 @@ clang_13.0_mpionly: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1711,7 +1713,7 @@ clang_13.0_hybrid: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1729,7 +1731,7 @@ clang_13.0_serial_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1751,7 +1753,7 @@ clang_13.0_mpionly_dbg: extends: .build_template image: 
i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1771,7 +1773,7 @@ clang_13.0_hybrid_dbg: extends: .build_template image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1791,7 +1793,7 @@ clang_13.0_hybrid_dbg_sp: image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:13.0 stage: pretest before_script: - - pip3 install lbmpy==1.1 jinja2 pytest + - pip3 install lbmpy==1.2 jinja2 pytest - cd python - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla - pip3 list @@ -1972,7 +1974,7 @@ clang-tidy: - cmake --version - mkdir $CI_PROJECT_DIR/build - cd $CI_PROJECT_DIR/build - - cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWALBERLA_BUFFER_DEBUG=ON -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=ON -DCMAKE_BUILD_TYPE=Debug -DWALBERLA_BUILD_WITH_METIS=ON -DWALBERLA_BUILD_WITH_PARMETIS=ON -DWALBERLA_BUILD_WITH_OPENMESH=ON -DWALBERLA_DOUBLE_ACCURACY=ON + - cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWALBERLA_BUFFER_DEBUG=ON -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=ON -DCMAKE_BUILD_TYPE=Debug -DWALBERLA_BUILD_WITH_METIS=ON -DWALBERLA_BUILD_WITH_PARMETIS=ON -DWALBERLA_BUILD_WITH_OPENMESH=ON -DWALBERLA_DOUBLE_ACCURACY=ON -DWALBERLA_LOGLEVEL=DETAIL - cmake . 
-LA - utilities/filterCompileCommands.py compile_commands.json - run-clang-tidy.py -quiet | tee clang-tidy-output.txt @@ -2010,7 +2012,7 @@ coverage: - mkdir build - cd build - if dpkg --compare-versions `ompi_info | head -2 | tail -1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/'` ge 1.10; then export MPIEXEC_PREFLAGS="--allow-run-as-root" ; fi - - cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=OFF -DCMAKE_BUILD_TYPE=DebugOptimized -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS -DWALBERLA_BUILD_WITH_CODEGEN=OFF -DWALBERLA_BUILD_WITH_GCOV=ON + - cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=OFF -DCMAKE_BUILD_TYPE=DebugOptimized -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS -DWALBERLA_BUILD_WITH_CODEGEN=OFF -DWALBERLA_BUILD_WITH_GCOV=ON -DWALBERLA_LOGLEVEL=DETAIL - cmake . -LA - make -j $NUM_BUILD_CORES -l $NUM_CORES - ctest -LE longrun --output-on-failure -j $NUM_CORES --timeout 3000 @@ -2051,7 +2053,7 @@ coverage: - cmake --version - mkdir build - cd build - - cmake -LA -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DWALBERLA_DOUBLE_ACCURACY=$WALBERLA_DOUBLE_ACCURACY -DWARNING_ERROR=ON -G "$CMAKE_GENERATOR" -DCMAKE_DISABLE_FIND_PACKAGE_Boost=TRUE .. + - cmake -LA -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DWALBERLA_DOUBLE_ACCURACY=$WALBERLA_DOUBLE_ACCURACY -DWALBERLA_LOGLEVEL=$WALBERLA_LOGLEVEL -DWARNING_ERROR=ON -G "$CMAKE_GENERATOR" -DCMAKE_DISABLE_FIND_PACKAGE_Boost=TRUE .. 
- MSBuild.exe walberla.sln /property:Configuration=$BUILD_CONFIGURATION /verbosity:minimal /maxcpucount:4 - ctest -LE $CTEST_EXCLUDE_LABELS -C $BUILD_CONFIGURATION --output-on-failure -j 4 -T Test after_script: @@ -2064,6 +2066,7 @@ coverage: WALBERLA_BUILD_WITH_MPI: "ON" WALBERLA_BUILD_WITH_OPENMP: "ON" WALBERLA_DOUBLE_ACCURACY: "ON" + WALBERLA_LOGLEVEL: "INFO" artifacts: when: always reports: @@ -2239,13 +2242,15 @@ msvc-14.2_mpionly: - mpirun --version - mkdir build - cd build - - cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_PYTHON=$WALBERLA_BUILD_WITH_PYTHON -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DWALBERLA_BUILD_WITH_CUDA=$WALBERLA_BUILD_WITH_CUDA -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DWARNING_ERROR=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + - cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_PYTHON=$WALBERLA_BUILD_WITH_PYTHON -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DWALBERLA_BUILD_WITH_CUDA=$WALBERLA_BUILD_WITH_CUDA -DWALBERLA_LOGLEVEL=$WALBERLA_LOGLEVEL -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DWARNING_ERROR=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - cmake . 
-LA - make -j $NUM_BUILD_CORES -l $NUM_CORES - ctest -LE "$CTEST_EXCLUDE_LABELS|cuda" -C $CMAKE_BUILD_TYPE --output-on-failure -j $NUM_CORES -T Test after_script: - pip3 install lxml - python3 cmake/ctest2junit.py build > report.xml + variables: + WALBERLA_LOGLEVEL: "DETAIL" tags: - macmini artifacts: @@ -2256,7 +2261,7 @@ msvc-14.2_mpionly: - python/report.xml mac_Serial_Dbg: - <<: *mac_build_definition + extends: .mac_build_template variables: CMAKE_BUILD_TYPE: "DebugOptimized" CTEST_EXCLUDE_LABELS: "longrun" @@ -2266,7 +2271,7 @@ mac_Serial_Dbg: WALBERLA_BUILD_WITH_CODEGEN: "ON" mac_Serial: - <<: *mac_build_definition + extends: .mac_build_template variables: CMAKE_BUILD_TYPE: "Release" CTEST_EXCLUDE_LABELS: "longrun" @@ -2276,7 +2281,7 @@ mac_Serial: WALBERLA_BUILD_WITH_CODEGEN: "ON" mac_MpiOnly_Dbg: - <<: *mac_build_definition + extends: .mac_build_template variables: CMAKE_BUILD_TYPE: "DebugOptimized" CTEST_EXCLUDE_LABELS: "longrun" @@ -2287,7 +2292,7 @@ mac_MpiOnly_Dbg: OMPI_MCA_btl: "self,tcp" mac_MpiOnly: - <<: *mac_build_definition + extends: .mac_build_template variables: CMAKE_BUILD_TYPE: "Release" CTEST_EXCLUDE_LABELS: "longrun" diff --git a/CHANGELOG.md b/CHANGELOG.md index 94c89858523f668f989bdbd0c2dd8764fdcbda6f..c8e6947b5ab0b95d5016e45e749a91a607992e6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,11 @@ - Add support for more shapes, e.g., convex polyhedron - MESA_PD: - Add extensive application for dense particle packing generation +- AMD - HIP support + - Support of the ROCm Toolchain and thus AMD HIP as second GPU language + - All CUDA related files, namespaces, folders etc are renamed to gpu. 
+ - Include "GPUWrapper.h" to use general GPU functions cudaMalloc -> gpuMalloc + - WALBERLA_BUILD_WITH_HIP and WALBERLA_BUILD_WITH_GPU_SUPPORT as new CMake variables introduced ### Changed - Update and extend phase-field LBM showcases diff --git a/CMakeLists.txt b/CMakeLists.txt index 468a1bd7af1726b60238f1e2e856a8a055a97098..77369d876b90e89761e33e8d12fca0547882fafc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ ## Contents: ## - definition of build options ## - compiler variables ( c++ standard, warnings etc. ) -## - Finding of service libraries. Required: none, Optional: Boost, MPI, FFTW3, METIS, OpenMesh, Python +## - Finding of service libraries. Required: none, Optional: MPI, FFTW3, METIS, OpenMesh, Python ## the include paths are set, and the libraries are added to variable SERVICE_LIBS ## - Subdirectory cmake lists are called ## -> src/ this folder contains all modules, each module (that contains c or cpp files) is linked to a @@ -82,6 +82,7 @@ option ( WALBERLA_BUILD_WITH_CODEGEN "Enable pystencils code generation" option ( WALBERLA_BUILD_WITH_LIKWID_MARKERS "Compile in markers for likwid-perfctr" ) option ( WALBERLA_BUILD_WITH_CUDA "Enable CUDA support" ) +option ( WALBERLA_BUILD_WITH_HIP "Enable ROCm HIP support" ) option ( WALBERLA_BUILD_WITH_FASTMATH "Fast math" ) @@ -101,6 +102,8 @@ option ( WALBERLA_LOG_SKIPPED "Log skipped cmake targets" option ( WALBERLA_GIT_SUBMODULE_AUTO "Check submodules during cmake run" ON ) +option ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT "Experimental half precision support" OFF ) + # Installation Directory set ( CMAKE_INSTALL_PREFIX /usr/local/waLBerla CACHE STRING "The default installation directory." 
) @@ -219,8 +222,7 @@ else() endif() mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_NEC ) -# Check for Clang compiler -if( CMAKE_CXX_COMPILER MATCHES "clang" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +if( CMAKE_CXX_COMPILER MATCHES "clang" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "clang" OR CMAKE_CXX_COMPILER MATCHES "hipcc" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) option ( WALBERLA_CXX_COMPILER_IS_CLANG "Use clang compiler" ON ) else() option ( WALBERLA_CXX_COMPILER_IS_CLANG "Use clang compiler" OFF ) @@ -379,7 +381,9 @@ if( WALBERLA_OPTIMIZE_FOR_LOCALHOST ) endif() if( EXISTS "/proc/sys/abi/sve_default_vector_length" ) - file( READ "/proc/sys/abi/sve_default_vector_length" SVE_LENGTH ) + file( READ "/proc/sys/abi/sve_default_vector_length" SVE_LENGTH_BYTES ) + string(STRIP "${SVE_LENGTH_BYTES}" SVE_LENGTH_BYTES) + math(EXPR SVE_LENGTH "${SVE_LENGTH_BYTES} * 8") add_flag ( CMAKE_CXX_FLAGS "-msve-vector-bits=${SVE_LENGTH}" ) add_flag ( CMAKE_C_FLAGS "-msve-vector-bits=${SVE_LENGTH}" ) endif() @@ -474,18 +478,18 @@ endif() # disable Xcode 7.3+ linker deduplication pass to speed up linking in debug mode -if ( APPLE ) - execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE LINKER_VERSION ERROR_VARIABLE LINKER_VERSION ) - string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} ) - string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} ) - list( GET LINKER_VERSION 0 LINKER_TYPE ) - list( GET LINKER_VERSION 1 LINKER_VERSION ) - if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 ) - add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") - add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") - add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") - endif() -endif() +#if ( APPLE ) +# execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE 
LINKER_VERSION ERROR_VARIABLE LINKER_VERSION ) +# string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} ) +# string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} ) +# list( GET LINKER_VERSION 0 LINKER_TYPE ) +# list( GET LINKER_VERSION 1 LINKER_VERSION ) +# if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 ) +# add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") +# add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") +# add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") +# endif() +#endif() ############################################################################################################################ @@ -603,7 +607,7 @@ endif () ## ############################################################################################################################# if ( WALBERLA_BUILD_WITH_CODEGEN ) - set(LBMPY_MIN_VERSION 1.1) + set(LBMPY_MIN_VERSION 1.2) execute_process(COMMAND ${Python_EXECUTABLE} -c "import lbmpy; print(lbmpy.__version__)" RESULT_VARIABLE LBMPY_FOUND OUTPUT_VARIABLE LBMPY_VERSION) if(NOT LBMPY_FOUND EQUAL 0) @@ -700,53 +704,6 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) endif() -############################################################################################################################ -## -## BOOST Libraries -## -############################################################################################################################# -set ( Boost_NO_BOOST_CMAKE ON) - -# This variable is necessary, if the CMAKE version used is not aware of a more recent boost version (keep this up to date!) 
-set ( Boost_ADDITIONAL_VERSIONS - "1.45" "1.45.0" "1.46" "1.46.0" "1.46.1" "1.47" "1.47.0" "1.48" "1.48.0" "1.49" "1.49.0" - "1.50" "1.50.0" "1.51" "1.51.0" "1.52" "1.52.0" "1.53" "1.53.0" "1.54" "1.54.0" "1.55" "1.55.0" - "1.56" "1.56.0" "1.57" "1.57.0" "1.58" "1.58.0" "1.59" "1.59.0" "1.60" "1.60.0" "1.61" "1.61.0" "1.62" "1.62.0" "1.63" "1.63.0" - "1.64.0" "1.65.0" "1.65.1" "1.66.0" "1.67.0" "1.68.0" "1.69.0" "1.70.0" "1.71.0" "1.72.0" "1.73.0" "1.74.0") - -# if you defined BOOST_ROOT or BOOST_BASE in your environment use it here to find boost too -if ( NOT BOOST_ROOT ) - foreach ( var BOOST_ROOT BOOST_BASE ) - if ( NOT "$ENV{${var}}" STREQUAL "" ) - message ( STATUS "Use environment boost directory: $ENV{${var}}" ) - set ( BOOST_ROOT $ENV{${var}} CACHE INTERNAL "") - break ( ) - endif ( ) - endforeach ( ) -endif ( ) - -find_package ( Boost ) - -if ( Boost_FOUND ) - if(CMAKE_GENERATOR STREQUAL "Xcode") - # this is needed because the SYSTEM flag to include_directories does not work - add_flag ( CMAKE_CXX_FLAGS "-isystem ${Boost_INCLUDE_DIRS}" ) - else() - include_directories ( SYSTEM ${Boost_INCLUDE_DIRS} ) - endif() - add_definitions ( -DBOOST_ALL_NO_LIB ) # Disable Boost auto-linking (CMAKE does that for us...) 
- - set( WALBERLA_BUILD_WITH_BOOST TRUE CACHE INTERNAL "Build with Boost" ) -else( Boost_FOUND ) - set( WALBERLA_BUILD_WITH_BOOST FALSE CACHE INTERNAL "Build with Boost" ) -endif( Boost_FOUND ) - - -############################################################################################################################ - - - - ############################################################################################################################ ## @@ -891,41 +848,24 @@ endif(WALBERLA_ENABLE_GUI) ## ############################################################################################################################ +if ( WALBERLA_BUILD_WITH_PARMETIS ) + # metis is required for parmetis + set( WALBERLA_BUILD_WITH_METIS TRUE FORCE ) +endif () + if ( WALBERLA_BUILD_WITH_METIS ) - find_package ( Metis QUIET ) + find_package( Metis REQUIRED ) - if ( METIS_FOUND ) - include_directories( ${METIS_INCLUDE_DIRS} ) - link_directories ( ${METIS_LIBRARY_DIR} ) - list ( APPEND SERVICE_LIBS ${METIS_LIBRARIES} ) - set ( WALBERLA_BUILD_WITH_METIS TRUE ) - else() - set ( WALBERLA_BUILD_WITH_METIS OFF CACHE BOOL "Build with metis graph partitioner" FORCE ) - endif() -else() - set ( METIS_FOUND OFF CACHE BOOL "Metis found" FORCE ) -endif() + include_directories( ${METIS_INCLUDE_DIRS} ) + list( APPEND SERVICE_LIBS ${METIS_LIBRARIES} ) +endif() if ( WALBERLA_BUILD_WITH_PARMETIS ) - find_path(PARMETIS_INCLUDE_DIR parmetis.h - /usr/local/include - /usr/include - ${PARMETIS_ROOT}/include - $ENV{PARMETIS_ROOT}/include - ) - - find_library(PARMETIS_LIBRARY parmetis - /usr/local/lib - /usr/lib - ${PARMETIS_ROOT}/lib - $ENV{PARMETIS_ROOT}/lib - ) - - if( PARMETIS_INCLUDE_DIR AND PARMETIS_LIBRARY AND METIS_LIBRARY ) - include_directories( ${PARMETIS_INCLUDE_DIR} ) - list ( APPEND SERVICE_LIBS ${PARMETIS_LIBRARY} ${METIS_LIBRARY} ) - endif() + find_package( Parmetis REQUIRED ) + + include_directories( ${PARMETIS_INCLUDE_DIR} ) + list( APPEND SERVICE_LIBS ${PARMETIS_LIBRARY} ) 
endif() ############################################################################################################################ @@ -1098,6 +1038,10 @@ endif() ## ############################################################################################################################ if ( WALBERLA_BUILD_WITH_CUDA ) + if (WALBERLA_BUILD_WITH_HIP) + message(FATAL_ERROR "For GPU support either use CUDA or HIP. Both simultaneously is not supported.") + endif() + include(CheckLanguage) check_language(CUDA) if( CMAKE_CUDA_COMPILER ) @@ -1119,6 +1063,7 @@ if ( WALBERLA_BUILD_WITH_CUDA ) #CUDA_FOUND is need for our cmake mechanism set ( CUDA_FOUND TRUE ) + set (WALBERLA_BUILD_WITH_GPU_SUPPORT TRUE) else() message( WARNING "CUDA could not be enabled. The host compiler might not be compatible. Check CMakeFiles/CMakeError.log for more information" ) set ( WALBERLA_BUILD_WITH_CUDA FALSE ) @@ -1146,6 +1091,38 @@ endif() +############################################################################################################################ +## +## ROCm HIP +## +############################################################################################################################ +if ( WALBERLA_BUILD_WITH_HIP ) + if (WALBERLA_BUILD_WITH_CUDA) + message(FATAL_ERROR "For GPU support either use CUDA or HIP. Both simultaneously is not supported.") + endif() + if (${CMAKE_VERSION} VERSION_LESS "3.21.0") + message(FATAL_ERROR "For HIP support CMake > 3.21.0 is needed. Please install a newer version") + endif() + + include(CheckLanguage) + check_language(HIP) + + if( CMAKE_HIP_COMPILER ) + enable_language(HIP) + # since waLBerla also supports CUDA we only use HIP on an AMD platform + add_compile_definitions(__HIP_PLATFORM_AMD__) + # include_directories(${HSA_HEADER}) + set (WALBERLA_BUILD_WITH_GPU_SUPPORT TRUE) + else() + message("HIP compiler not found. 
HIP support is not possible") + set ( WALBERLA_BUILD_WITH_HIP FALSE ) + endif ( ) +endif ( ) + +############################################################################################################################ + + + ############################################################################################################################ ## ## Testing Coverage @@ -1249,6 +1226,34 @@ if ( WALBERLA_SANITIZE_UNDEFINED ) endif() endif() +############################################################################################################################ +## +## Half precision +## +############################################################################################################################ +if (WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT) + if (WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG) + message(STATUS "Configuring with *experimental* half precision (float16) support. You better know what you are doing.") + if (WALBERLA_CXX_COMPILER_IS_GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0.0) + message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "Half precision support for gcc has only been tested with version >= 12. " + "You are using a previous version - it may not work correctly.") + endif () + if (WALBERLA_CXX_COMPILER_IS_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "Half precision support for clang has only been tested with version >= 15. " + "You are using a previous version - it may not work correctly.") + endif () + if (NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST) + message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! 
Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!") + endif () + else () + message(FATAL_ERROR "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "Half precision support is currently only available for gcc and clang.") + endif () +endif () + ############################################################################################################################ # Documentation Generation # diff --git a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp index e6a1f4f6f9c405151b6c764ebdefab5859ee1b66..38d1d9ad4a159c369acd85733884898a084e87b7 100644 --- a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp +++ b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp @@ -1492,17 +1492,17 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth, finestLevel ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add velocity field and utility - BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), 
field::zyxf, uint_t(2) ); + BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::fzyx, uint_t(2) ); using VelocityFieldWriter_T = lbm::VelocityFieldWriter<PdfField_T, VelocityField_T>; BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldID, velocityFieldID ) ); diff --git a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSettlingSphere.cpp b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSettlingSphere.cpp index bbba3ba7060a5eafc7ad5c868b8a50e977d6b8d0..2f738db33c3e0a0c3e1600a579f40873c38fa459 100644 --- a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSettlingSphere.cpp +++ b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSettlingSphere.cpp @@ -977,17 +977,17 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth, finestLevel ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add velocity field and utility - BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::zyxf, uint_t(2) 
); + BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::fzyx, uint_t(2) ); using VelocityFieldWriter_T = lbm::VelocityFieldWriter<PdfField_T, VelocityField_T>; BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldID, velocityFieldID ) ); diff --git a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp index 01f0b3f99a4b998b53a0d9f431442db5916ce1d9..a12727e32f47e7c57c3ccbf7714cff7aa6daf3b9 100644 --- a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp +++ b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp @@ -682,15 +682,15 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 3f5e6a95a0f0c6a8199de013da440e37e7f78afb..f37d24767eb383e55b1ff2764770ee525bb54c68 100644 
--- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -25,10 +25,12 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) add_subdirectory( FlowAroundSphereCodeGen ) add_subdirectory( UniformGridCPU ) add_subdirectory( PhaseFieldAllenCahn ) + add_subdirectory( NonUniformGridCPU ) endif() - if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_CUDA ) + if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT ) add_subdirectory( UniformGridGPU ) + add_subdirectory( NonUniformGridGPU ) endif() endif() diff --git a/apps/benchmarks/CouetteFlow/CouetteFlow.cpp b/apps/benchmarks/CouetteFlow/CouetteFlow.cpp index b313738d067f0f0ad6501f4e1776d3e0ed9f2bf9..1f3ea1b7dfade52e614a916a40261415e6874984 100644 --- a/apps/benchmarks/CouetteFlow/CouetteFlow.cpp +++ b/apps/benchmarks/CouetteFlow/CouetteFlow.cpp @@ -773,7 +773,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // remaining time logger - const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); + const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" ); // logging right before the simulation starts diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt index faaaf44dc20de92da16b0333a08ab26886ab9fd7..17cfd93fd6f8c1efce2a82fcd6ad24dfd14c76bf 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt +++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt @@ -11,10 +11,10 @@ waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated FlowAroundSphereCodeGen_PackInfoOdd.${CODEGEN_FILE_SUFFIX} FlowAroundSphereCodeGen_PackInfoOdd.h FlowAroundSphereCodeGen_InfoHeader.h) -if (WALBERLA_BUILD_WITH_CUDA) +if 
(WALBERLA_BUILD_WITH_GPU_SUPPORT ) waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp - DEPENDS blockforest boundary core cuda domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) else () waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk FlowAroundSphereGenerated) -endif (WALBERLA_BUILD_WITH_CUDA) \ No newline at end of file +endif (WALBERLA_BUILD_WITH_GPU_SUPPORT ) \ No newline at end of file diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp index bdd1ccbe13995ea6632c6013fd9e54dae9b1f6bc..08e5de4d928d9a23e76104823721dfbb5d811f69 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp @@ -37,13 +37,13 @@ #include "timeloop/all.h" #if defined(WALBERLA_BUILD_WITH_CUDA) -# include "cuda/AddGPUFieldToStorage.h" -# include "cuda/DeviceSelectMPI.h" -# include "cuda/HostFieldAllocator.h" -# include "cuda/NVTX.h" -# include "cuda/ParallelStreams.h" -# include "cuda/communication/GPUPackInfo.h" -# include "cuda/communication/UniformGPUScheme.h" +# include "gpu/AddGPUFieldToStorage.h" +# include "gpu/DeviceSelectMPI.h" +# include "gpu/HostFieldAllocator.h" +# include "gpu/NVTX.h" +# include "gpu/ParallelStreams.h" +# include "gpu/communication/GPUPackInfo.h" +# include "gpu/communication/UniformGPUScheme.h" #endif // CodeGen includes @@ -58,7 +58,7 @@ typedef walberla::uint8_t flag_t; typedef FlagField< flag_t > FlagField_T; #if defined(WALBERLA_BUILD_WITH_CUDA) -typedef cuda::GPUField< real_t > GPUField; +typedef gpu::GPUField< real_t > 
GPUField; #endif using namespace std::placeholders; @@ -134,7 +134,7 @@ int main(int argc, char** argv) { walberla::Environment walberlaEnv(argc, argv); #if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); #endif for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) @@ -157,8 +157,8 @@ int main(int argc, char** argv) const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5)); const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds // create fields BlockDataID pdfFieldID = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs"); @@ -166,11 +166,11 @@ int main(int argc, char** argv) BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); #if defined(WALBERLA_BUILD_WITH_CUDA) - BlockDataID pdfFieldIDGPU = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true); + BlockDataID pdfFieldIDGPU = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true); BlockDataID velFieldIDGPU = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true); + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true); BlockDataID densityFieldIDGPU = - cuda::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true); + gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true); #endif BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); @@ -180,7 +180,7 @@ int main(int 
argc, char** argv) pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU); for (auto& block : *blocks) setterSweep(&block); - cuda::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU); + gpu::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU); #else pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldID, velFieldID); for (auto& block : *blocks) @@ -192,11 +192,11 @@ int main(int argc, char** argv) // This way of using alternating pack infos is temporary and will soon be replaced // by something more straight-forward - cuda::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); + gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU)); auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); }); - cuda::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); + gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU)); auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); }); #else @@ -270,8 +270,8 @@ int main(int argc, char** argv) #if defined(WALBERLA_BUILD_WITH_CUDA) vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU); - cuda::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU); + gpu::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU); + gpu::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU); }); #endif auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity"); diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py index 
c170a8101422dadce166196b011444f4faf08ccb..7dd9d531b9730e9851e0f8cf53b7b48c4ae930a0 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py @@ -47,6 +47,7 @@ with CodeGeneration() as ctx: pdfs=pdfs, density=1.0, streaming_pattern=streaming_pattern, previous_timestep=timesteps[0]) + setter_assignments = setter_assignments.new_without_unused_subexpressions() # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True} diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py index 41d38d16218d97a633ccca62c951356b16c2f446..673c10e4d7a2a04117d2cb3a25ab1999d94311bd 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py @@ -4,10 +4,10 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity class Scenario: def __init__(self): - self.timesteps = 1001 + self.timesteps = 10 self.vtkWriteFrequency = 100 - self.cells = (384, 128, 128) + self.cells = (64, 32, 32) self.blocks = (1, 1, 1) self.periodic = (0, 0, 0) diff --git a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp index afba732e96685175d890d80fc7e2d4d6f8091948..e902cd667770314243c2d4558ec91ae7258f05e0 100644 --- a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp +++ b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp @@ -1051,14 +1051,14 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, 
lbm::collision_model::TRT::threeSixteenth ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addBlockData( make_shared< MyBoundaryHandling >( blocks, flagFieldID, pdfFieldID, particleFieldID, accessor ), "boundary handling" ); diff --git a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp index 423af69ce7fdbca2aef2bc7f9e5663ef2051901e..61d8f4e7cedd3fdff4f412d32a9926ca4070f074 100644 --- a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp +++ b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp @@ -610,15 +610,15 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T 
>( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling & initialize outer domain boundaries using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; diff --git a/apps/benchmarks/ForcesOnSphereNearPlaneInShearFlow/ForcesOnSphereNearPlaneInShearFlow.cpp b/apps/benchmarks/ForcesOnSphereNearPlaneInShearFlow/ForcesOnSphereNearPlaneInShearFlow.cpp index 15ccbda0cdb8a5a929f927fd994e4e10f72a0aea..f504c47507b7e1152bda7d64ed11be055f25c17d 100644 --- a/apps/benchmarks/ForcesOnSphereNearPlaneInShearFlow/ForcesOnSphereNearPlaneInShearFlow.cpp +++ b/apps/benchmarks/ForcesOnSphereNearPlaneInShearFlow/ForcesOnSphereNearPlaneInShearFlow.cpp @@ -604,14 +604,14 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth, finestLevel ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID 
bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID ), "boundary handling" ); diff --git a/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp b/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp index 11935abb7fe7283e4e98d53f5e51aaad7c9b568f..2f911818bfc0b33a51592c8fe4e5a098f015fa3f 100644 --- a/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp +++ b/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp @@ -902,20 +902,20 @@ int main( int argc, char **argv ) // add PDF field // initial velocity in domain = inflow velocity - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, uInfty, real_t(1), uint_t(1), field::zyxf ); + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, uInfty, real_t(1), uint_t(1), field::fzyx ); // add PDF field (needed to store pre collision values for MEM_MR scheme) - BlockDataID pdfFieldPreColID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "nqOdd field (zyxf)", latticeModel, uInfty, real_t(1), uint_t(1), field::zyxf ); + BlockDataID pdfFieldPreColID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "nqOdd field (fzyx)", latticeModel, uInfty, real_t(1), uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", 
nullptr, field::fzyx ); // add body and volume fraction field BlockDataID bodyAndVolumeFractionFieldID = field::addToStorage< BodyAndVolumeFractionField_T >( blocks, "body and volume fraction field", - std::vector<BodyAndVolumeFraction_T>(), field::zyxf, 0 ); + std::vector<BodyAndVolumeFraction_T>(), field::fzyx, 0 ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f332e065fed35fa99367127e9d44b211849cc7b3 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt @@ -0,0 +1,15 @@ +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) +waLBerla_link_files_to_builddir( "simulation_setup" ) + +waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated + FILE NonUniformGridCPU.py + OUT_FILES NonUniformGridCPUStorageSpecification.h NonUniformGridCPUStorageSpecification.cpp + NonUniformGridCPUSweepCollection.h NonUniformGridCPUSweepCollection.cpp + NoSlip.h NoSlip.cpp + UBB.h UBB.cpp + NonUniformGridCPUBoundaryCollection.h + NonUniformGridCPUInfoHeader.h) +waLBerla_add_executable( NAME NonUniformGridCPU + FILES NonUniformGridCPU.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated ) \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3a44867523a9b94f8f2f9b57bc8b5aeb6aac6819 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp @@ -0,0 +1,311 @@ +//====================================================================================================================== +// +// 
This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGridCPU.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/loadbalancing/StaticCurve.h" + +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "core/timing/RemainingTimeLogger.h" +#include "core/timing/TimingPool.h" + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" + +#include "python_coupling/CreateConfig.h" +#include "python_coupling/PythonCallback.h" + +#include "timeloop/SweepTimeloop.h" + +#include <cmath> + +#include "NonUniformGridCPUInfoHeader.h" + +using namespace walberla; + +using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using 
CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; + +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::NonUniformGridCPUBoundaryCollection< FlagField_T >; + +using SweepCollection_T = lbm::NonUniformGridCPUSweepCollection; + +using blockforest::communication::NonUniformBufferedScheme; +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + + for (auto block : blocks) + { + if (forest.atDomainYMaxBorder(*block)) + { + if (block->getLevel() < refinementDepth_) { block->setMarker(true); } + } + } + } +}; + +class LDC +{ + public: + LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } + RefinementSelectionFunctor refinementSelector() { return LDCRefinement(refinementDepth_); } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< Block& >(*bIt); + const uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() 
< 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = ldcSetup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), numProcesses); +} + +int main(int argc, char** argv) +{ + const mpi::Environment env(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + WALBERLA_MPI_WORLD_BARRIER() + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// SETUP AND CONFIGURATION /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto config = *cfg; + 
logging::configureLogging(config); + auto domainSetup = config->getOneBlock("DomainSetup"); + + // Reading parameters + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool writeSetupForestAndReturn = parameters.getParameter< bool >("writeSetupForestAndReturn", false); + const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false); + const uint_t numProcesses = parameters.getParameter< uint_t >( "numProcesses"); + + auto ldc = std::make_shared< LDC >(refinementDepth); + SetupBlockForest setupBfs; + if (writeSetupForestAndReturn) + { + WALBERLA_LOG_INFO_ON_ROOT("Creating SetupBlockForest for " << numProcesses << " processes") + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc, numProcesses); + + WALBERLA_ROOT_SECTION() { setupBfs.writeVTKOutput("SetupBlockForest"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level); + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) + } + + WALBERLA_LOG_INFO_ON_ROOT("Ending program") + return EXIT_SUCCESS; + } + + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc); + + // Create structured block forest + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = + std::make_shared< 
StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level)) + } + + // Creating fields + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + const BlockDataID pdfFieldID = + lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx); + const BlockDataID velFieldID = + field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2)); + const BlockDataID densityFieldID = + field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2)); + const BlockDataID flagFieldID = + field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3)); + + const Cell innerOuterSplit = + Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1))); + SweepCollection_T sweepCollection(blocks, pdfFieldID, densityFieldID, velFieldID, omega, innerOuterSplit); + for (auto& block : *blocks) + { + sweepCollection.initialise(&block, 2); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// LB SWEEPS AND BOUNDARY HANDLING /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const FlagUID fluidFlagUID("Fluid"); + ldc->setupBoundaryFlagField(*blocks, flagFieldID); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldID, fluidFlagUID); + + 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// COMMUNICATION SCHEME /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...") + auto communication = std::make_shared< NonUniformBufferedScheme< CommunicationStencil_T > >(blocks); + auto packInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldID); + communication->addPackInfo(packInfo); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME STEP DEFINITIONS /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > LBMMeshRefinement( + blocks, pdfFieldID, sweepCollection, boundaryCollection, communication, packInfo); + + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + if(benchmarkKernelOnly){ + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + } + else{ + LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); + } + + // VTK + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "vel"); + vtkOutput->addCellDataWriter(velWriter); + + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + sweepCollection.calculateMacroscopicParameters(&block); + }); + timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// BENCHMARK /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds + if (remainingTimeLoggerFrequency > 0) + { + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency); + timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); + } + + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); + field::CellCounter< FlagField_T > fluidCells( blocks, flagFieldID, fluidFlagUID ); + fluidCells(); + + WALBERLA_LOG_INFO_ON_ROOT( "Non uniform Grid benchmark with " << fluidCells.numberOfCells() << " fluid cells (in total on all levels)") + + WcTimingPool timeloopTiming; + WcTimer simTimer; + + WALBERLA_LOG_INFO_ON_ROOT("Starting benchmark with " << timesteps << " time steps") + simTimer.start(); + timeLoop.run(timeloopTiming); + simTimer.end(); + + WALBERLA_LOG_INFO_ON_ROOT("Benchmark finished") + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) + } + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py new file mode 100644 index 0000000000000000000000000000000000000000..3b350b6c9c48e0418244101cb3de1daec26c34ce --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py @@ -0,0 +1,68 @@ +import sympy as sp + +import pystencils as ps + +from 
lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil + +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +omega = sp.symbols("omega") +omega_free = sp.Symbol("omega_free") + +info_header = """ +const char * infoStencil = "{stencil}"; +const char * infoStreamingPattern = "{streaming_pattern}"; +const char * infoCollisionSetup = "{collision_setup}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + +with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'pull' + timesteps = get_timesteps(streaming_pattern) + stencil = LBStencil(Stencil.D3Q19) + + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx") + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) + + generate_lbm_package(ctx, name="NonUniformGridCPU", + collision_rule=collision_rule, + lbm_config=lbm_config, 
lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + target=ps.Target.CPU) + + infoHeaderParams = { + 'stencil': stencil.name.lower(), + 'streaming_pattern': streaming_pattern, + 'collision_setup': lbm_config.method.name.lower(), + 'cse_global': int(lbm_opt.cse_global), + 'cse_pdfs': int(lbm_opt.cse_pdfs), + } + + field_typedefs = {'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + + generate_info_header(ctx, 'NonUniformGridCPUInfoHeader', + field_typedefs=field_typedefs, + additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..1de18d9f0f6ed8ef684eddee74f4712c8f72c852 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py @@ -0,0 +1,57 @@ +import waLBerla as wlb + + +class Scenario: + def __init__(self, domain_size=(32, 32, 32), root_blocks=(2, 2, 2), + cells_per_block=(16, 16, 16)): + + self.domain_size = domain_size + self.root_blocks = root_blocks + self.cells_per_block = cells_per_block + + self.periodic = (0, 0, 0) + + self.config_dict = self.config(print_dict=False) + + @wlb.member_callback + def config(self, print_dict=True): + from pprint import pformat + config_dict = { + 'DomainSetup': { + 'domainSize': self.domain_size, + 'rootBlocks': self.root_blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic + }, + 'Parameters': { + 'omega': 1.95, + 'timesteps': 101, + + 'refinementDepth': 1, + 'writeSetupForestAndReturn': False, + 'numProcesses': 1, + + 'benchmarkKernelOnly': False, + + 'remainingTimeLoggerFrequency': 3, + + 'vtkWriteFrequency': 50, + } + } + + if print_dict: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + + +def validation_run(): + """Run with 
full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + scenario = Scenario() + scenarios.add(scenario) + + +validation_run() diff --git a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6840007e14d5f5af685bb5b262c8bcfd6138d6e --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt @@ -0,0 +1,15 @@ +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) +waLBerla_link_files_to_builddir( "simulation_setup" ) + +waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated + FILE NonUniformGridGPU.py + OUT_FILES NonUniformGridGPUStorageSpecification.h NonUniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX} + NonUniformGridGPUSweepCollection.h NonUniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX} + NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX} + UBB.h UBB.${CODEGEN_FILE_SUFFIX} + NonUniformGridGPUBoundaryCollection.h + NonUniformGridGPUInfoHeader.h) +waLBerla_add_executable( NAME NonUniformGridGPU + FILES NonUniformGridGPU.cpp + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridGPUGenerated ) \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa3905b4236295275d82e2e4aad91be4ddcbb5ba --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp @@ -0,0 +1,361 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGridGPU.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/loadbalancing/StaticCurve.h" + +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "core/timing/RemainingTimeLogger.h" +#include "core/timing/TimingPool.h" + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/FieldCopy.h" +#include "gpu/ErrorChecking.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/NonUniformGPUScheme.h" + +#include "lbm_generated/evaluation/PerformanceEvaluation.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h" +#include "lbm_generated/gpu/GPUPdfField.h" +#include "lbm_generated/gpu/AddToStorage.h" +#include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h" + +#include "python_coupling/CreateConfig.h" +#include 
"python_coupling/DictWrapper.h" +#include "python_coupling/PythonCallback.h" + +#include "timeloop/SweepTimeloop.h" + +#include <cmath> + +#include "NonUniformGridGPUInfoHeader.h" +using namespace walberla; + +using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; + +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >; +using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField_T >; + +using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection; + +using gpu::communication::NonUniformGPUScheme; +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + + for (auto block : blocks) + { + if (forest.atDomainYMaxBorder(*block)) + { + if (block->getLevel() < refinementDepth_) { block->setMarker(true); } + } + } + } +}; + +class LDC +{ + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; + + public: + LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } + RefinementSelectionFunctor refinementSelector() + { + return LDCRefinement(refinementDepth_); + } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< Block& >(*bIt); + const uint_t 
level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = ldcSetup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); +} + +int main(int argc, char** argv) +{ + const mpi::Environment env(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + 
gpu::selectDeviceBasedOnMpiRank(); + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + WALBERLA_MPI_WORLD_BARRIER() + + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// SETUP AND CONFIGURATION /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto config = *cfg; + logging::configureLogging(config); + auto domainSetup = config->getOneBlock("DomainSetup"); + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + // Reading parameters + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); + const bool writeSetupForestAndReturn = parameters.getParameter< bool >("writeSetupForestAndReturn", false); + const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false); + const uint_t numProcesses = parameters.getParameter< uint_t >( "numProcesses"); + + auto ldc = std::make_shared< LDC >(refinementDepth ); + SetupBlockForest setupBfs; + if (writeSetupForestAndReturn) + { + WALBERLA_LOG_INFO_ON_ROOT("Creating SetupBlockForest for " << numProcesses << " processes") + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc, numProcesses); + + WALBERLA_ROOT_SECTION() { setupBfs.writeVTKOutput("SetupBlockForest"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + uint_t totalCellUpdates( 0.0 ); + for (uint_t level = 0; 
level <= refinementDepth; level++) + { + const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level); + const uint_t numberOfCells = numberOfBlocks * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]; + totalCellUpdates += timesteps * math::uintPow2(level) * numberOfCells; + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) + } + cudaDeviceProp prop; + WALBERLA_GPU_CHECK(gpuGetDeviceProperties(&prop, 0)) + + const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]; + + const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q; + const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE); + const uint_t sizePerValue = sizeof(PdfField_T::value_type); + const double totalGPUMem = double_c(prop.totalGlobalMem) * 1e-9; + const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9; + + WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)") + WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB") + WALBERLA_LOG_INFO_ON_ROOT( "The total cell updates after " << timesteps << " timesteps (on the coarse level) will be " << totalCellUpdates) + WALBERLA_LOG_INFO_ON_ROOT( "Total GPU memory " << totalGPUMem) + + WALBERLA_LOG_INFO_ON_ROOT("Ending program") + return EXIT_SUCCESS; + } + + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *ldc); + + // Create structured block forest + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + 
WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level)) + } + + WALBERLA_LOG_INFO_ON_ROOT("Start field allocation") + // Creating fields + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + auto allocator = make_shared< gpu::HostFieldAllocator<real_t> >(); + const BlockDataID pdfFieldCpuID = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx, allocator); + const BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2), allocator); + const BlockDataID densityFieldCpuID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2), allocator); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3)); + + const BlockDataID pdfFieldGpuID = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true); + const BlockDataID velFieldGpuID = + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); + const BlockDataID densityFieldGpuID = + gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "velocity on GPU", true); + WALBERLA_LOG_INFO_ON_ROOT("Finished field allocation") + + const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); + Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); + SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, 
innerOuterSplit); + for (auto& iBlock : *blocks) + { + sweepCollection.initialise(&iBlock, 2, nullptr); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// LB SWEEPS AND BOUNDARY HANDLING /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + const FlagUID fluidFlagUID("Fluid"); + ldc->setupBoundaryFlagField(*blocks, flagFieldID); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// COMMUNICATION SCHEME /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...") + auto communication = std::make_shared< NonUniformGPUScheme <CommunicationStencil_T>> (blocks, cudaEnabledMPI); + auto packInfo = lbm_generated::setupNonuniformGPUPdfCommunication<GPUPdfField_T>(blocks, pdfFieldGpuID); + communication->addPackInfo(packInfo); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME STEP DEFINITIONS /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + int streamHighPriority = 0; + int streamLowPriority = 0; + WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) + sweepCollection.setOuterPriority(streamHighPriority); + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + + lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T > LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, 
communication, packInfo); + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + // LBMMeshRefinement.test(5); + // return EXIT_SUCCESS; + + if(benchmarkKernelOnly){ + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + } + else{ + LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); + } + + // VTK + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel"); + vtkOutput->addCellDataWriter(velWriter); + + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + sweepCollection.calculateMacroscopicParameters(&block); + gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); + }); + timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// BENCHMARK /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds + if (remainingTimeLoggerFrequency > 0) + { + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency); + timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); + } + + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); + field::CellCounter< FlagField_T > fluidCells( blocks, flagFieldID, fluidFlagUID ); + fluidCells(); + + WALBERLA_LOG_INFO_ON_ROOT( "Non uniform Grid benchmark with " << 
fluidCells.numberOfCells() << " fluid cells (in total on all levels)") + + WcTimingPool timeloopTiming; + WcTimer simTimer; + + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + simTimer.start(); + timeLoop.run(timeloopTiming); + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + simTimer.end(); + + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) + } + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py new file mode 100644 index 0000000000000000000000000000000000000000..d523b5c0c1b8dfcbfa1cf112c0342edfdee03c7d --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py @@ -0,0 +1,79 @@ +import sympy as sp +import numpy as np + +import pystencils as ps +from pystencils.typing import TypedSymbol + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil + +from pystencils_walberla import CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +omega = sp.symbols("omega") +omega_free = sp.Symbol("omega_free") +compile_time_block_size = False +max_threads = 256 + +sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), + TypedSymbol("cudaBlockSize1", np.int32), + TypedSymbol("cudaBlockSize2", np.int32)) + +gpu_indexing_params = {'block_size': 
sweep_block_size} + +info_header = """ +const char * infoStencil = "{stencil}"; +const char * infoStreamingPattern = "{streaming_pattern}"; +const char * infoCollisionSetup = "{collision_setup}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + +with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'pull' + timesteps = get_timesteps(streaming_pattern) + stencil = LBStencil(Stencil.D3Q19) + + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx') + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) + + generate_lbm_package(ctx, name="NonUniformGridGPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params, + max_threads=max_threads) + + infoHeaderParams = { + 'stencil': stencil.name.lower(), + 'streaming_pattern': streaming_pattern, + 'collision_setup': lbm_config.method.name.lower(), + 'cse_global': int(lbm_opt.cse_global), + 'cse_pdfs': int(lbm_opt.cse_pdfs), + } + + 
field_typedefs = {'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + + generate_info_header(ctx, 'NonUniformGridGPUInfoHeader', + field_typedefs=field_typedefs, + additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..d05852fd1934c71ea67d6cce3a8ae3f4cc80e61a --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py @@ -0,0 +1,66 @@ +import waLBerla as wlb + + +class Scenario: + def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2), + cells_per_block=(32, 32, 32), refinement_depth=0): + + self.domain_size = domain_size + self.root_blocks = root_blocks + self.cells_per_block = cells_per_block + self.refinement_depth = refinement_depth + + self.periodic = (0, 0, 0) + + self.config_dict = self.config(print_dict=False) + + @wlb.member_callback + def config(self, print_dict=True): + from pprint import pformat + config_dict = { + 'DomainSetup': { + 'domainSize': self.domain_size, + 'rootBlocks': self.root_blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic + }, + 'Parameters': { + 'omega': 1.95, + 'timesteps': 1501, + + 'refinementDepth': self.refinement_depth, + 'writeSetupForestAndReturn': False, + 'numProcesses': 1, + + 'cudaEnabledMPI': False, + 'benchmarkKernelOnly': False, + + 'remainingTimeLoggerFrequency': 3, + + 'vtkWriteFrequency': 500, + } + } + + if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + + +def validation_run(): + """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + + domain_size = (64, 64, 64) + cells_per_block = (32, 32, 32) + + 
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)]) + + scenarios = wlb.ScenarioManager() + scenario = Scenario(domain_size=domain_size, + root_blocks=root_blocks, + cells_per_block=cells_per_block, + refinement_depth=1) + scenarios.add(scenario) + + +validation_run() diff --git a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt index e998b35efa9e5e01dc20e7dc91f4791321ae58b0..1b530d61a14ca8d84cbc1f3d9c28ea873258f7a0 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt +++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt @@ -12,13 +12,13 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h GenDefines.h) -if (WALBERLA_BUILD_WITH_CUDA) +if (WALBERLA_BUILD_WITH_GPU_SUPPORT ) waLBerla_add_executable(NAME benchmark_multiphase FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py - DEPENDS blockforest core cuda field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) + DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) else () waLBerla_add_executable(NAME benchmark_multiphase FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen) -endif (WALBERLA_BUILD_WITH_CUDA) +endif (WALBERLA_BUILD_WITH_GPU_SUPPORT ) diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp index b757f3a5b306e9a7dc75e38da5551ad3098c483a..c8992a65afb93fa7dae572959a654651f14aabde 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp +++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp @@ -44,11 +44,11 @@ 
//////////////////////////// #if defined(WALBERLA_BUILD_WITH_CUDA) -# include "cuda/AddGPUFieldToStorage.h" -# include "cuda/DeviceSelectMPI.h" -# include "cuda/ParallelStreams.h" -# include "cuda/communication/MemcpyPackInfo.h" -# include "cuda/communication/UniformGPUScheme.h" +# include "gpu/AddGPUFieldToStorage.h" +# include "gpu/DeviceSelectMPI.h" +# include "gpu/ParallelStreams.h" +# include "gpu/communication/MemcpyPackInfo.h" +# include "gpu/communication/UniformGPUScheme.h" #else # include <blockforest/communication/UniformBufferedScheme.h> #endif @@ -61,14 +61,14 @@ using flag_t = walberla::uint8_t; using FlagField_T = FlagField< flag_t >; #if defined(WALBERLA_BUILD_WITH_CUDA) -typedef cuda::GPUField< real_t > GPUField; +typedef gpu::GPUField< real_t > GPUField; #endif int main(int argc, char** argv) { mpi::Environment env(argc, argv); #if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); #endif for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) @@ -95,14 +95,14 @@ int main(int argc, char** argv) BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); BlockDataID vel_field_gpu = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); 
+ gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = - cuda::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); + gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); #else BlockDataID lb_phase_field = field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx); @@ -128,7 +128,7 @@ int main(int argc, char** argv) initPhaseField_RTI(blocks, phase_field); } #if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); + gpu::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); #endif WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field done") } @@ -154,7 +154,7 @@ int main(int argc, char** argv) #if defined(WALBERLA_BUILD_WITH_CUDA) const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false); auto Comm_velocity_based_distributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu); Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions); @@ -162,7 +162,7 @@ int main(int argc, char** argv) Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field); auto Comm_phase_field_distributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu); 
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions); @@ -183,7 +183,7 @@ int main(int argc, char** argv) Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions); #endif - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); // Boundaries const FlagUID fluidFlagUID("Fluid"); auto boundariesConfig = config->getBlock("Boundaries_GPU"); @@ -206,10 +206,10 @@ int main(int argc, char** argv) } #if defined(WALBERLA_BUILD_WITH_CUDA) - int streamLowPriority = 0; - int streamHighPriority = 0; - auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority); + int const streamLowPriority = 0; + int const streamHighPriority = 0; + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority); #endif auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps); @@ -296,14 +296,14 @@ int main(int argc, char** argv) timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger"); - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 1) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step", false, true, true, false, 0); #if defined(WALBERLA_BUILD_WITH_CUDA) vtkOutput->addBeforeFunction( - [&]() { cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); }); + [&]() { gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); }); #endif auto phaseWriter = make_shared< 
field::VTKWriter< PhaseField_T > >(phase_field, "phase"); vtkOutput->addCellDataWriter(phaseWriter); diff --git a/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp b/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp index 2acb0a54620bef5d8ab2fce03f23255cabc08c91..6f21d16ad34beffce6a711ae27d4adb42cc4b049 100644 --- a/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp +++ b/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp @@ -890,7 +890,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // remaining time logger - const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); + const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" ); // logging right before the simulation starts diff --git a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp index 87802191ac7148e92174cbdeeca2e1013240b096..3c75d4ab686d7f55021fe12bf844c54b74901da3 100644 --- a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp +++ b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp @@ -2363,7 +2363,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // add velocity field + initialize velocity field writer (only used for simulations with an adaptive block structure) using VelocityField_T = field::GhostLayerField<Vector3<real_t>, 1>; - BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::zyxf, FieldGhostLayers, true, None, Empty ); + BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::fzyx, FieldGhostLayers, true, None, Empty ); using VelocityFieldWriter_T = 
lbm::VelocityFieldWriter<typename Types<LatticeModel_T>::PdfField_T, VelocityField_T>; BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldId, velocityFieldId ), None, Empty ); @@ -2623,7 +2623,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod // remaining time logger - const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); + const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" ); // logging right before the simulation starts diff --git a/apps/benchmarks/UniformGridCPU/CMakeLists.txt b/apps/benchmarks/UniformGridCPU/CMakeLists.txt index a2f06826e40553f9c157c5b5e5200ba8ed2b26b2..0d159bc542c6ada48999dace8e2b7dce4a085519 100644 --- a/apps/benchmarks/UniformGridCPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt @@ -15,13 +15,11 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config} FILE UniformGridCPU.py CODEGEN_CFG ${config} - OUT_FILES UniformGridCPU_LbKernel.cpp UniformGridCPU_LbKernel.h - UniformGridCPU_PackInfoEven.cpp UniformGridCPU_PackInfoEven.h - UniformGridCPU_PackInfoOdd.cpp UniformGridCPU_PackInfoOdd.h - UniformGridCPU_NoSlip.cpp UniformGridCPU_NoSlip.h - UniformGridCPU_UBB.cpp UniformGridCPU_UBB.h - UniformGridCPU_MacroSetter.cpp UniformGridCPU_MacroSetter.h - UniformGridCPU_MacroGetter.cpp UniformGridCPU_MacroGetter.h + OUT_FILES UniformGridCPUStorageSpecification.h UniformGridCPUStorageSpecification.cpp + UniformGridCPUSweepCollection.h UniformGridCPUSweepCollection.cpp + NoSlip.cpp NoSlip.h + UBB.cpp UBB.h + UniformGridCPUBoundaryCollection.h UniformGridCPU_StreamOnlyKernel.cpp 
UniformGridCPU_StreamOnlyKernel.h UniformGridCPU_InfoHeader.h ) diff --git a/apps/benchmarks/UniformGridCPU/InitShearVelocity.h b/apps/benchmarks/UniformGridCPU/InitShearVelocity.h index 9a6c7d1db63a5a7ad53376ed33f56851a806dafe..fd13a03b6d89ed30007969ecb2b77f27d015d8a6 100644 --- a/apps/benchmarks/UniformGridCPU/InitShearVelocity.h +++ b/apps/benchmarks/UniformGridCPU/InitShearVelocity.h @@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); - real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); velField->get(x, y, z, 1) = real_t(0); velField->get(x, y, z, 2) = randomReal; diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index 9916a71e18fa09d490a2b4bdb16029136a6106a3..64d94ce3d0dd843b29e693d446485bac73b84119 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -34,7 +34,10 @@ #include "geometry/InitBoundaryHandling.h" -#include "lbm/communication/CombinedInPlaceCpuPackInfo.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" #include "python_coupling/CreateConfig.h" #include "python_coupling/DictWrapper.h" @@ -50,21 +53,20 @@ using namespace walberla; -using PackInfoEven_T = lbm::UniformGridCPU_PackInfoEven; -using PackInfoOdd_T = lbm::UniformGridCPU_PackInfoOdd; -using LbSweep = lbm::UniformGridCPU_LbKernel; +using StorageSpecification_T = lbm::UniformGridCPUStorageSpecification; +using Stencil_T = 
lbm::UniformGridCPUStorageSpecification::Stencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::UniformGridCPUBoundaryCollection< FlagField_T >; -auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) { - return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block), - storage->getNumberOfZCells(*block), uint_t(1), field::fzyx, - make_shared< field::AllocateAligned< real_t, 64 > >()); -}; +using SweepCollection_T = lbm::UniformGridCPUSweepCollection; + +using blockforest::communication::UniformBufferedScheme; int main(int argc, char** argv) { - mpi::Environment env(argc, argv); + const mpi::Environment env(argc, argv); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { @@ -74,8 +76,6 @@ int main(int argc, char** argv) logging::configureLogging(config); auto blocks = blockforest::createUniformBlockGridFromConfig(config); - Vector3< uint_t > cellsPerBlock = - config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); // Reading parameters auto parameters = config->getOneBlock("Parameters"); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); @@ -83,9 +83,12 @@ int main(int argc, char** argv) const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); // Creating fields - BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs"); - BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); - BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx); + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + auto fieldAllocator = make_shared< field::AllocateAligned< real_t, 64 > >(); + const BlockDataID pdfFieldId = 
lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, field::fzyx, fieldAllocator); + const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); + const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); // Initialize velocity on cpu if (initShearFlow) @@ -94,157 +97,76 @@ int main(int argc, char** argv) initShearVelocity(blocks, velFieldId); } - pystencils::UniformGridCPU_MacroSetter setterSweep(densityFieldId, pdfFieldId, velFieldId); - pystencils::UniformGridCPU_MacroGetter getterSweep(densityFieldId, pdfFieldId, velFieldId); + const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); + SweepCollection_T sweepCollection(blocks, pdfFieldId, densityFieldId, velFieldId, omega, innerOuterSplit); - // Set up initial PDF values for (auto& block : *blocks) - setterSweep(&block); - - Vector3< int > innerOuterSplit = - parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1)); - - for (uint_t i = 0; i < 3; ++i) { - if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) - { - WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") - } + sweepCollection.initialise(&block); } - Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); - LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell); - pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId); + const pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId); // Boundaries const FlagUID fluidFlagUID("Fluid"); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock("Boundaries"); - bool 
boundaries = false; if (boundariesConfig) { WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions") - boundaries = true; geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig); - geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } - - lbm::UniformGridCPU_NoSlip noSlip(blocks, pdfFieldId); - noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); - - lbm::UniformGridCPU_UBB ubb(blocks, pdfFieldId); - ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldId, fluidFlagUID); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// COMMUNICATION SCHEME /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Initial setup is the post-collision state of an even time step - auto tracker = make_shared< lbm::TimestepTracker >(0); - auto packInfo = - make_shared< lbm::CombinedInPlaceCpuPackInfo< PackInfoEven_T , PackInfoOdd_T > >(tracker, pdfFieldId); - - blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); + auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId); + UniformBufferedScheme< Stencil_T > communication(blocks); communication.addPackInfo(packInfo); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME STEP DEFINITIONS /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + const std::string timeStepStrategy = parameters.getParameter< std::string 
>("timeStepStrategy", "normal"); - auto boundarySweep = [&](IBlock* block, uint8_t t) { - noSlip.run(block, t); - ubb.run(block, t); - }; - - auto boundaryInner = [&](IBlock* block, uint8_t t) { - noSlip.inner(block, t); - ubb.inner(block, t); - }; - - auto boundaryOuter = [&](IBlock* block, uint8_t t) { - noSlip.outer(block, t); - ubb.outer(block, t); - }; - - auto simpleOverlapTimeStep = [&]() { - // Communicate post-collision values of previous timestep... - communication.startCommunication(); - for (auto& block : *blocks) - { - if (boundaries) boundaryInner(&block, tracker->getCounter()); - lbSweep.inner(&block, tracker->getCounterPlusOne()); - } - communication.wait(); - for (auto& block : *blocks) - { - if (boundaries) boundaryOuter(&block, tracker->getCounter()); - lbSweep.outer(&block, tracker->getCounterPlusOne()); - } - - tracker->advance(); - }; - - auto normalTimeStep = [&]() { - communication.communicate(); - for (auto& block : *blocks) - { - if (boundaries) boundarySweep(&block, tracker->getCounter()); - lbSweep(&block, tracker->getCounterPlusOne()); - } - - tracker->advance(); - }; - - // With two-fields patterns, ghost layer cells act as constant stream-in boundaries; - // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries. 
- auto kernelOnlyFunc = [&]() { - tracker->advance(); - for (auto& block : *blocks) - lbSweep(&block, tracker->getCounter()); - }; - - // Stream only function to test a streaming pattern without executing lbm operations inside - auto StreamOnlyFunc = [&]() { - for (auto& block : *blocks) - StreamOnlyKernel(&block); - }; + if (timeStepStrategy == "noOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication, "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + }else { + timeLoop.add() << BeforeFunction(communication, "communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");} + + } else if (timeStepStrategy == "simpleOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame"); + }else{ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame");} + + } else if (timeStepStrategy == "kernelOnly") { + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + } else if (timeStepStrategy == 
"StreamOnly") { + timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only"); + } else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'") + } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME LOOP SETUP /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); - - const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); - std::function< void() > timeStep; - if (timeStepStrategy == "noOverlap") - timeStep = std::function< void() >(normalTimeStep); - else if (timeStepStrategy == "simpleOverlap") - timeStep = simpleOverlapTimeStep; - else if (timeStepStrategy == "kernelOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only compute kernel without boundary - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - communication.communicate(); - timeStep = kernelOnlyFunc; - } - else if (timeStepStrategy == "StreamOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only streaming kernel without LBM - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - timeStep = StreamOnlyFunc; - } - else - { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. 
Allowed values are 'noOverlap', " - "'simpleOverlap', 'kernelOnly'") - } - - timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); - - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", @@ -254,7 +176,7 @@ int main(int argc, char** argv) vtkOutput->addBeforeFunction([&]() { for (auto& block : *blocks){ - getterSweep(&block);} + sweepCollection.calculateMacroscopicParameters(&block);} }); timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); @@ -263,46 +185,50 @@ int main(int argc, char** argv) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// BENCHMARK /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); - int warmupSteps = parameters.getParameter< int >("warmupSteps", 2); - int outerIterations = parameters.getParameter< int >("outerIterations", 1); - for (int i = 0; i < warmupSteps; ++i) + const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2)); + const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1)); + for (uint_t i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - real_t remainingTimeLoggerFrequency = + auto remainingTimeLoggerFrequency = parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds if (remainingTimeLoggerFrequency > 0) { - auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * 
outerIterations, remainingTimeLoggerFrequency); timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); } - for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) + for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration) { timeLoop.setCurrentTimeStepToZero(); + + WcTimingPool timeloopTiming; WcTimer simTimer; + + WALBERLA_MPI_WORLD_BARRIER() WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + simTimer.start(); - timeLoop.run(); + timeLoop.run(timeloopTiming); simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") - auto time = real_c(simTimer.last()); - WALBERLA_MPI_SECTION() - { - walberla::mpi::reduceInplace(time, walberla::mpi::MAX); - } - auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) - auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) - WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) WALBERLA_ROOT_SECTION() { python_coupling::PythonCallback pythonCallbackResults("results_callback"); if (pythonCallbackResults.isCallable()) { - pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); + pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); pythonCallbackResults.data().exposeValue("stencil", infoStencil); pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py 
b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py index cba55fac4675c18d8f25f10541de2138002b1208..cd1a36114788a0ad440f89d750abc8af26109eda 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py @@ -6,19 +6,17 @@ import pystencils as ps from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\ insert_symbol_times_minus_one -from lbmpy.advanced_streaming import Timestep, is_inplace +from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule from lbmpy.enums import Method, Stencil from lbmpy.fieldaccess import CollideOnlyInplaceAccessor -from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter +from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.updatekernels import create_stream_only_kernel -from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\ - generate_mpidtype_info_from_kernel, generate_info_header - -from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary, generate_lb_pack_info +from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator omega = sp.symbols('omega') omega_free = sp.Symbol('omega_free') @@ -121,15 +119,17 @@ with CodeGeneration() as ctx: options = options_dict[collision_setup] - q = stencil.Q - dim = stencil.D - assert dim == 3, "This app supports only three-dimensional stencils" - pdfs, pdfs_tmp = ps.fields(f"pdfs({q}), pdfs_tmp({q}): {field_type}[3D]", layout='fzyx') + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), 
pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') + if lbm_config.method == Method.CENTRAL_MOMENT: + lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil)) + if not is_inplace(streaming_pattern): lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp) field_swaps = [(pdfs, pdfs_tmp)] @@ -153,46 +153,22 @@ with CodeGeneration() as ctx: collision_rule = insert_aliases(collision_rule) collision_rule = insert_symbol_times_minus_one(collision_rule) - lb_method = collision_rule.method - - generate_alternating_lbm_sweep(ctx, 'UniformGridCPU_LbKernel', collision_rule, lbm_config=lbm_config, - lbm_optimisation=lbm_opt, target=ps.Target.CPU, - inner_outer_split=True, field_swaps=field_swaps, - cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) - - # getter & setter - setter_assignments = macroscopic_values_setter(lb_method, - density=density_field.center, velocity=velocity_field.center_vector, - pdfs=pdfs, - streaming_pattern=streaming_pattern, - previous_timestep=Timestep.EVEN) - getter_assignments = macroscopic_values_getter(lb_method, - density=density_field, velocity=velocity_field, - pdfs=pdfs, - streaming_pattern=streaming_pattern, - previous_timestep=Timestep.EVEN) - - generate_sweep(ctx, 'UniformGridCPU_MacroSetter', setter_assignments, target=ps.Target.CPU, cpu_openmp=openmp) - generate_sweep(ctx, 'UniformGridCPU_MacroGetter', getter_assignments, target=ps.Target.CPU, cpu_openmp=openmp) + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', 
flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) + + generate_lbm_package(ctx, name="UniformGridCPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=False, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) # Stream only kernel generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, target=ps.Target.CPU, cpu_openmp=openmp) - # Boundaries - noslip = NoSlip() - ubb = UBB((0.05, 0, 0), data_type=field_type) - - generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_NoSlip', noslip, lb_method, field_name=pdfs.name, - streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp) - generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_UBB', ubb, lb_method, field_name=pdfs.name, - streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp) - - # communication - generate_lb_pack_info(ctx, 'UniformGridCPU_PackInfo', stencil, pdfs, - streaming_pattern=streaming_pattern, target=ps.Target.CPU, - always_generate_separate_classes=True) - infoHeaderParams = { 'stencil': stencil_str, 'streaming_pattern': streaming_pattern, @@ -201,13 +177,10 @@ with CodeGeneration() as ctx: 'cse_pdfs': int(lbm_opt.cse_pdfs), } - stencil_typedefs = {'Stencil_T': stencil, - 'CommunicationStencil_T': stencil} - field_typedefs = {'PdfField_T': pdfs, - 'VelocityField_T': velocity_field, + field_typedefs = {'VelocityField_T': velocity_field, 'ScalarField_T': density_field} # Info header containing correct template definitions for stencil and field generate_info_header(ctx, 'UniformGridCPU_InfoHeader', - stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, + field_typedefs=field_typedefs, additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/PizDaintJobScript.py 
b/apps/benchmarks/UniformGridCPU/simulation_setup/PizDaintJobScript.py new file mode 100644 index 0000000000000000000000000000000000000000..3c4aa08ec2c2328be7d102d4f377a2cd754dc8af --- /dev/null +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/PizDaintJobScript.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +import os +from waLBerla.tools.config import block_decomposition + + +job_script_header = """ +#!/bin/bash -l +#SBATCH --job-name=scaling +#SBATCH --time=01:00:00 +#SBATCH --nodes={nodes} +#SBATCH -o out_scaling_{nodes}_%j.txt +#SBATCH -e err_scaling_{nodes}_%j.txt +#SBATCH --ntasks-per-core=1 +#SBATCH --cpus-per-task=1 +#SBATCH --partition=normal +#SBATCH --constraint=gpu +#SBATCH --account=s1042 + +source ~/env.sh + +export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer +export CRAY_CUDA_MPS=1 # allow GPU sharing +export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages + +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK +export CRAY_CUDA_MPS=1 + +export MPICH_RANK_REORDER_METHOD=3 +export PMI_MMAP_SYNC_WAIT_TIME=300 + +cd {folder} +# grid_order -R -H -c 1,1,8 -g 16,16,8 + +ulimit -c 0 +""" + +job_script_exe_part = """ + +export WALBERLA_SCENARIO_IDX=0 +while srun -n {nodes} ./{app} {config} +do + ((WALBERLA_SCENARIO_IDX++)) +done +""" + +streaming_patterns = ['pull', 'push', 'aa', 'esotwist'] +stencils = ['d3q27', 'd3q19'] +methods = ['srt', 'mrt', 'cumulant', 'entropic'] + +all_executables = [] + +for stencil in stencils: + for streaming_pattern in streaming_patterns: + for method in methods: + all_executables.append(f"UniformGridGPU_{stencil}_{streaming_pattern}_{method}") + +all_executables = tuple(all_executables) + + +def generate_jobscripts(exe_names=all_executables): + for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: + folder_name = "scaling_{:04d}".format(node_count) + os.makedirs(folder_name, exist_ok=True) + + # run grid_order + import subprocess + decomposition = 
block_decomposition(node_count) + decomposition_str = ",".join(str(e) for e in decomposition) + subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str]) + + job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name)) + for exe in exe_names: + job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, + config='../communication_compare.py') + + with open(os.path.join(folder_name, 'job.sh'), 'w') as f: + f.write(job_script) + + +if __name__ == '__main__': + print("Called without waLBerla - generating job scripts for PizDaint") + generate_jobscripts() diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py index f432e778bc8e7d5c82120db40469ed7d2f2aa7ed..9acab66da85c8f5477251e66bc7a9ea37ccc2fd7 100755 --- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -9,13 +9,15 @@ from math import prod # Number of time steps run for a workload of 128^3 per process # if double as many cells are on the process, half as many time steps are run etc. 
# increase this to get more reliable measurements -TIME_STEPS_FOR_128_BLOCK = 5 +TIME_STEPS_FOR_128_BLOCK = 10 DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3") def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK): cells = block_size[0] * block_size[1] * block_size[2] time_steps = (128 ** 3 / cells) * time_steps_for_128_block + if time_steps < TIME_STEPS_FOR_128_BLOCK: + time_steps = 5 return int(time_steps) @@ -39,7 +41,7 @@ class Scenario: init_shear_flow = False periodic = (0, 0, 0) - self.blocks = block_decomposition(wlb.mpi.numProcesses()) + self.blocks = (2, 1, 1) # block_decomposition(wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -66,6 +68,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, + 'oneBlockPerProcess': False }, 'Parameters': { 'omega': self.omega, @@ -176,6 +179,7 @@ def single_node_benchmark(): for block_size in block_sizes: scenario = Scenario(cells_per_block=block_size, time_step_strategy='kernelOnly', + outer_iterations=1, timesteps=num_time_steps(block_size)) scenarios.add(scenario) @@ -185,26 +189,26 @@ def validation_run(): wlb.log_info_on_root("Validation run") wlb.log_info_on_root("") - time_step_strategy = 'simpleOverlap' # 'noOverlap' + time_step_strategy = "noOverlap" # "noOverlap" scenarios = wlb.ScenarioManager() scenario = Scenario(cells_per_block=(64, 64, 64), time_step_strategy=time_step_strategy, - timesteps=101, + timesteps=201, outer_iterations=1, warmup_steps=0, - init_shear_flow=True, - boundary_setup=False, - vtk_write_frequency=100, + init_shear_flow=False, + boundary_setup=True, + vtk_write_frequency=50, remaining_time_logger_frequency=10) scenarios.add(scenario) wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") # Select the benchmark you want to run -single_node_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures 
single GPU +# single_node_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU # performance of compute kernel (no communication) # overlap_benchmark() # benchmarks different communication overlap options # profiling() # run only two timesteps on a smaller domain for profiling only -# validation_run() +validation_run() # scaling_benchmark() diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 890d77f124479016436f5b0de7ba3f9eb49384fa..66a5b0fa4f4a3588f36ba4dbd5feb732131f76d0 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -1,4 +1,3 @@ - waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_link_files_to_builddir( "*.py" ) waLBerla_link_files_to_builddir( "simulation_setup" ) @@ -15,19 +14,18 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} FILE UniformGridGPU.py CODEGEN_CFG ${config} - OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h - UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h - UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h - UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h - UniformGridGPU_UBB.cu UniformGridGPU_UBB.h - UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h - UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h + OUT_FILES UniformGridGPUStorageSpecification.h UniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX} + UniformGridGPUSweepCollection.h UniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX} + NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX} + UBB.h UBB.${CODEGEN_FILE_SUFFIX} + UniformGridGPUBoundaryCollection.h + UniformGridGPU_StreamOnlyKernel.h UniformGridGPU_StreamOnlyKernel.${CODEGEN_FILE_SUFFIX} UniformGridGPU_InfoHeader.h ) waLBerla_add_executable(NAME UniformGridGPU_${config} FILES UniformGridGPU.cpp - DEPENDS blockforest boundary core 
cuda domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config}) + DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config}) # all configs are excluded from all except for pull d3q27. if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") diff --git a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h index 9a6c7d1db63a5a7ad53376ed33f56851a806dafe..fd13a03b6d89ed30007969ecb2b77f27d015d8a6 100644 --- a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h +++ b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h @@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); - real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); velField->get(x, y, z, 1) = real_t(0); velField->get(x, y, z, 2) = randomReal; diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 4d2ee1afaf27a1bf73514a11c3ab19bd092bd40b..ee022f457738fb6f8aa71f615441e9279fd25eca 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -27,21 +27,26 @@ #include "core/timing/RemainingTimeLogger.h" #include "core/timing/TimingPool.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/DeviceSelectMPI.h" -#include "cuda/FieldCopy.h" -#include "cuda/ParallelStreams.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h" - #include "field/AddToStorage.h" #include "field/FlagField.h" -#include 
"field/communication/PackInfo.h" #include "field/vtk/VTKWriter.h" #include "geometry/InitBoundaryHandling.h" -#include "lbm/inplace_streaming/TimestepTracker.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUWrapper.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/UniformGPUScheme.h" + +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h" +#include "lbm_generated/gpu/GPUPdfField.h" +#include "lbm_generated/gpu/AddToStorage.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" #include "python_coupling/CreateConfig.h" #include "python_coupling/DictWrapper.h" @@ -53,20 +58,31 @@ #include "InitShearVelocity.h" #include "UniformGridGPU_InfoHeader.h" + using namespace walberla; +using StorageSpecification_T = lbm::UniformGridGPUStorageSpecification; +using Stencil_T = lbm::UniformGridGPUStorageSpecification::Stencil; + +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >; using FlagField_T = FlagField< uint8_t >; +using BoundaryCollection_T = lbm::UniformGridGPUBoundaryCollection< FlagField_T >; + +using SweepCollection_T = lbm::UniformGridGPUSweepCollection; + +using gpu::communication::UniformGPUScheme; int main(int argc, char** argv) { - mpi::Environment env(argc, argv); - cuda::selectDeviceBasedOnMpiRank(); + mpi::Environment const env(argc, argv); + gpu::selectDeviceBasedOnMpiRank(); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { WALBERLA_MPI_WORLD_BARRIER() - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// SETUP AND CONFIGURATION 
/// @@ -76,18 +92,21 @@ int main(int argc, char** argv) logging::configureLogging(config); auto blocks = blockforest::createUniformBlockGridFromConfig(config); - Vector3< uint_t > cellsPerBlock = - config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); // Reading parameters auto parameters = config->getOneBlock("Parameters"); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); + const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); // Creating fields - BlockDataID pdfFieldCpuID = - field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_c(std::nan("")), field::fzyx); - BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); + const StorageSpecification_T StorageSpec = StorageSpecification_T(); + const BlockDataID pdfFieldCpuID = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx); + + auto allocator = make_shared< gpu::HostFieldAllocator<real_t> >(); // use pinned memory allocator for faster CPU-GPU memory transfers + const BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(1), allocator); + const BlockDataID densityFieldCpuID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(1), allocator); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); // Initialize velocity on cpu if (initShearFlow) @@ -96,181 +115,92 @@ int main(int argc, char** argv) initShearVelocity(blocks, velFieldCpuID); } - BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true); - // Velocity field is copied to the GPU - BlockDataID 
velFieldGpuID = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); - - pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID); + const BlockDataID pdfFieldGpuID = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true); + const BlockDataID velFieldGpuID = + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true); + const BlockDataID densityFieldGpuID = + gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "density on GPU", true); - // Set up initial PDF values + const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1))); + Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); + SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit); for (auto& block : *blocks) - setterSweep(&block); - - Vector3< int > innerOuterSplit = - parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1)); - - for (uint_t i = 0; i < 3; ++i) { - if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) - { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") } + sweepCollection.initialise(&block); } - Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); - bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false); - Vector3< int32_t > gpuBlockSize = - parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1)); - int streamHighPriority = 0; int streamLowPriority = 0; - WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) - + 
WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority)) + sweepCollection.setOuterPriority(streamHighPriority); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - using LbSweep = lbm::UniformGridGPU_LbKernel; - using PackInfoEven = lbm::UniformGridGPU_PackInfoEven; - using PackInfoOdd = lbm::UniformGridGPU_PackInfoOdd; - using cuda::communication::UniformGPUScheme; - - LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); - lbSweep.setOuterPriority(streamHighPriority); - - pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], - gpuBlockSize[2]); + const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); // Boundaries const FlagUID fluidFlagUID("Fluid"); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); auto boundariesConfig = config->getBlock("Boundaries"); - bool boundaries = false; if (boundariesConfig) { - boundaries = true; + WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions") geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig); - geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } - - lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); - noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); - - lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); - ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); - - // Initial setup is the post-collision state of an even time step - auto tracker = make_shared< 
lbm::TimestepTracker >(0); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); + BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// COMMUNICATION SCHEME /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI); - auto packInfo = - make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); - comm.addPackInfo(packInfo); + UniformGPUScheme< Stencil_T > communication(blocks, cudaEnabledMPI); + auto packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUPdfField_T >>(pdfFieldGpuID); + communication.addPackInfo(packInfo); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME STEP DEFINITIONS /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); - auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - - auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) { - noSlip.run(block, t, stream); - ubb.run(block, t, stream); - }; - - auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) { - noSlip.inner(block, t, stream); - ubb.inner(block, t, stream); - }; - - auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) { - noSlip.outer(block, t, stream); - ubb.outer(block, t, stream); - }; - - auto simpleOverlapTimeStep = [&]() { - // Communicate post-collision values of previous timestep... 
- comm.startCommunication(defaultStream); - for (auto& block : *blocks) - { - if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); - lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream); - } - comm.wait(defaultStream); - for (auto& block : *blocks) - { - if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); - lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream); - } - - tracker->advance(); - }; - - auto normalTimeStep = [&]() { - comm.communicate(defaultStream); - for (auto& block : *blocks) - { - if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); - lbSweep(&block, tracker->getCounterPlusOne(), defaultStream); - } - - tracker->advance(); - }; - - // With two-fields patterns, ghost layer cells act as constant stream-in boundaries; - // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries. - auto kernelOnlyFunc = [&]() { - tracker->advance(); - for (auto& block : *blocks) - lbSweep(&block, tracker->getCounter(), defaultStream); - }; - - // Stream only function to test a streaming pattern without executing lbm operations inside - auto StreamOnlyFunc = [&]() { - for (auto& block : *blocks) - StreamOnlyKernel(&block, defaultStream); - }; + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + + if (timeStepStrategy == "noOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide"); + }else { + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");} + + } else if (timeStepStrategy 
== "simpleOverlap") { + if (boundariesConfig){ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame"); + }else{ + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); + timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") + << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");} + + } else if (timeStepStrategy == "kernelOnly") { + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide"); + } else if (timeStepStrategy == "StreamOnly") { + timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only"); + } else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'") + } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME LOOP SETUP /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); - - const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); - std::function< void() > timeStep; - if (timeStepStrategy == "noOverlap") - timeStep = std::function< void() >(normalTimeStep); - else if 
(timeStepStrategy == "simpleOverlap") - timeStep = simpleOverlapTimeStep; - else if (timeStepStrategy == "kernelOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only compute kernel without boundary - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - comm.communicate(); - timeStep = kernelOnlyFunc; - } - else if (timeStepStrategy == "StreamOnly") - { - WALBERLA_LOG_INFO_ON_ROOT( - "Running only streaming kernel without LBM - this makes only sense for benchmarking!") - // Run initial communication once to provide any missing stream-in populations - timeStep = StreamOnlyFunc; - } - else - { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', " - "'simpleOverlap', 'kernelOnly'") - } - - timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); - // VTK - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", @@ -279,7 +209,9 @@ int main(int argc, char** argv) vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); + for (auto& block : *blocks) + sweepCollection.calculateMacroscopicParameters(&block); + gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); }); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } @@ -288,12 +220,13 @@ int main(int argc, char** argv) /// BENCHMARK /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int warmupSteps = parameters.getParameter< int >("warmupSteps", 2); - int 
outerIterations = parameters.getParameter< int >("outerIterations", 1); - for (int i = 0; i < warmupSteps; ++i) + lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); + const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2)); + const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1)); + for (uint_t i = 0; i < warmupSteps; ++i) timeLoop.singleStep(); - double remainingTimeLoggerFrequency = + auto remainingTimeLoggerFrequency = parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds if (remainingTimeLoggerFrequency > 0) { @@ -302,32 +235,36 @@ int main(int argc, char** argv) timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); } - for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) + for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration) { - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) timeLoop.setCurrentTimeStepToZero(); + WcTimingPool const timeloopTiming; WcTimer simTimer; - cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + WALBERLA_GPU_CHECK( gpuPeekAtLastError() ) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") simTimer.start(); timeLoop.run(); - cudaDeviceSynchronize(); + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") - auto time = real_c(simTimer.last()); - auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + double time = simTimer.max(); + WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); } + performance.logResultOnRoot(timesteps, time); + + const auto reducedTimeloopTiming = timeloopTiming.getReduced(); + WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << 
*reducedTimeloopTiming) - auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) - WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) WALBERLA_ROOT_SECTION() { python_coupling::PythonCallback pythonCallbackResults("results_callback"); if (pythonCallbackResults.isCallable()) { - pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); + pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); pythonCallbackResults.data().exposeValue("stencil", infoStencil); pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); @@ -339,6 +276,5 @@ int main(int argc, char** argv) } } } - - return 0; + return EXIT_SUCCESS; } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index e8fa9906aa1ae005af20f0f77178fb054a528161..3d7579e5bcb3f3713f59a9afd94d7fed790c21e9 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -8,22 +8,21 @@ from pystencils.typing import TypedSymbol from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil -from lbmpy.advanced_streaming import Timestep, is_inplace +from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import create_lb_collision_rule -from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.updatekernels import create_stream_only_kernel from lbmpy.fieldaccess import * from pystencils_walberla import CodeGeneration, 
generate_info_header, generate_sweep -from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator omega = sp.symbols("omega") omega_free = sp.Symbol("omega_free") compile_time_block_size = False -max_threads = None +max_threads = 256 if compile_time_block_size: sweep_block_size = (128, 1, 1) @@ -124,11 +123,10 @@ with CodeGeneration() as ctx: options = options_dict[collision_setup] - q = stencil.Q - dim = stencil.D - assert dim == 3, "This app supports only three-dimensional stencils" - pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]", - layout='fzyx') + assert stencil.D == 3, "This application supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') @@ -142,12 +140,6 @@ with CodeGeneration() as ctx: else: field_swaps = [] - vp = [ - ('int32_t', 'cudaBlockSize0'), - ('int32_t', 'cudaBlockSize1'), - ('int32_t', 'cudaBlockSize2') - ] - # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM. # is_inplace is set to False to ensure that the streaming is done with src and dst field. # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense. 
@@ -165,38 +157,25 @@ with CodeGeneration() as ctx: lb_method = collision_rule.method - generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, lbm_config=lbm_config, - lbm_optimisation=lbm_opt, target=ps.Target.GPU, - gpu_indexing_params=gpu_indexing_params, - inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps, - max_threads=max_threads) + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=field_type)) - # getter & setter - setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector, - pdfs=pdfs, - streaming_pattern=streaming_pattern, - previous_timestep=Timestep.EVEN) - generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU, max_threads=max_threads) + generate_lbm_package(ctx, name="UniformGridGPU", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=False, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields, + target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params, + max_threads=max_threads) # Stream only kernel + vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')] generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU, max_threads=max_threads) - # Boundaries - noslip = NoSlip() - ubb = UBB((0.05, 0, 0), data_type=field_type) - - generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_NoSlip', noslip, lb_method, field_name=pdfs.name, - streaming_pattern=streaming_pattern, target=ps.Target.GPU) - generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_UBB', ubb, lb_method, field_name=pdfs.name, - 
streaming_pattern=streaming_pattern, target=ps.Target.GPU) - - # communication - generate_lb_pack_info(ctx, 'UniformGridGPU_PackInfo', stencil, pdfs, - streaming_pattern=streaming_pattern, target=ps.Target.GPU, - always_generate_separate_classes=True) - infoHeaderParams = { 'stencil': stencil_str, 'streaming_pattern': streaming_pattern, @@ -205,12 +184,10 @@ with CodeGeneration() as ctx: 'cse_pdfs': int(lbm_opt.cse_pdfs), } - stencil_typedefs = {'Stencil_T': stencil, - 'CommunicationStencil_T': stencil} - field_typedefs = {'PdfField_T': pdfs, - 'VelocityField_T': velocity_field} + field_typedefs = {'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} # Info header containing correct template definitions for stencil and field generate_info_header(ctx, 'UniformGridGPU_InfoHeader', - stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, + field_typedefs=field_typedefs, additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp index 48fca7135ba30b8f546b421c1329e847e0b6d129..f6199b1ec6a1dcade811d79e67b3555f26332c9d 100644 --- a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp @@ -14,15 +14,15 @@ #include "timeloop/all.h" #include "core/math/Random.h" #include "geometry/all.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/ParallelStreams.h" -#include "cuda/NVTX.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/ParallelStreams.h" +#include "gpu/NVTX.h" #include "core/timing/TimingPool.h" #include "core/timing/RemainingTimeLogger.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/DeviceSelectMPI.h" +#include "gpu/AddGPUFieldToStorage.h" +#include 
"gpu/communication/UniformGPUScheme.h" +#include "gpu/DeviceSelectMPI.h" #include "domain_decomposition/SharedSweep.h" #include "UniformGridGPU_LatticeModel.h" @@ -48,7 +48,7 @@ const auto Q = LatticeModel_T::Stencil::Q; using Stencil_T = LatticeModel_T::Stencil; using CommunicationStencil_T = LatticeModel_T::CommunicationStencil; using PdfField_T = GhostLayerField<real_t, Q>; -using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; +using CommScheme_T = gpu::communication::UniformGPUScheme<CommunicationStencil_T>; using VelocityField_T = GhostLayerField<real_t, 3>; using flag_t = walberla::uint8_t; using FlagField_T = FlagField<flag_t>; @@ -56,7 +56,7 @@ using FlagField_T = FlagField<flag_t>; int main( int argc, char **argv ) { mpi::Environment env( argc, argv ); - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) { @@ -96,7 +96,7 @@ int main( int argc, char **argv ) initialComm(); } - BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); + BlockDataID pdfFieldGpuID = gpu::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -155,13 +155,13 @@ int main( int argc, char **argv ) gpuBlockSize[0], gpuBlockSize[1], Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); lbKernel.setOuterPriority( streamHighPriority ); - UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > + UniformGridGPU_Communication< CommunicationStencil_T, gpu::GPUField< double > > gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI ); - auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); - auto innerOuterStreams = cuda::ParallelStreams( 
streamHighPriority ); - auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority ); - auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority ); + auto defaultStream = gpu::StreamRAII::newPriorityStream( streamLowPriority ); + auto innerOuterStreams = gpu::ParallelStreams( streamHighPriority ); + auto boundaryOuterStreams = gpu::ParallelStreams( streamHighPriority ); + auto boundaryInnerStreams = gpu::ParallelStreams( streamHighPriority ); uint_t currentTimeStep = 0; @@ -177,12 +177,12 @@ int main( int argc, char **argv ) auto overlapTimeStep = [&]() { - cuda::NvtxRange namedRange("timestep"); + gpu::NvtxRange namedRange("timestep"); auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream ); innerOuterSection.run([&]( auto innerStream ) { - cuda::nameStream(innerStream, "inner stream"); + gpu::nameStream(innerStream, "inner stream"); for( auto &block: *blocks ) { if(!disableBoundaries) @@ -197,7 +197,7 @@ int main( int argc, char **argv ) innerOuterSection.run([&]( auto outerStream ) { - cuda::nameStream(outerStream, "outer stream"); + gpu::nameStream(outerStream, "outer stream"); gpuComm( outerStream ); for( auto &block: *blocks ) @@ -215,7 +215,7 @@ int main( int argc, char **argv ) }; - auto boundaryStreams = cuda::ParallelStreams( streamHighPriority ); + auto boundaryStreams = gpu::ParallelStreams( streamHighPriority ); auto normalTimeStep = [&]() { gpuComm(); @@ -268,7 +268,7 @@ int main( int argc, char **argv ) auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction( [&]() { - cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); + gpu::fieldCpy<PdfField_T, gpu::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); for( auto & block : *blocks ) getterSweep( &block ); }); diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h 
b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h index aadf51331d507ef9bb12f3fc010606f18b599491..20c301bd3b5cfecc37f332442f3dc2396a7771da 100644 --- a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h +++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h @@ -7,9 +7,9 @@ #include "blockforest/communication/UniformDirectScheme.h" #include "field/communication/StencilRestrictedMPIDatatypeInfo.h" #include "field/communication/UniformMPIDatatypeInfo.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/communication/UniformGPUScheme.h" -#include "cuda/communication/MemcpyPackInfo.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/communication/UniformGPUScheme.h" +#include "gpu/communication/MemcpyPackInfo.h" #include "UniformGridGPU_PackInfo.h" @@ -36,28 +36,28 @@ public: _gpuCommunicationScheme(nullptr), _directScheme(nullptr) { auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId ); - auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId ); + auto memcpyPackInfo = make_shared< gpu::communication::MemcpyPackInfo< GPUFieldType > >( bdId ); auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId ); auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId ); switch(_commSchemeType) { case GPUPackInfo_Baseline: - _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId ); + _gpuPackInfo = make_shared< gpu::communication::GPUPackInfo< GPUFieldType > >( bdId ); _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf ); _cpuCommunicationScheme->addPackInfo( _gpuPackInfo ); break; case GPUPackInfo_Streams: - _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId ); + _gpuPackInfo = make_shared< 
gpu::communication::GPUPackInfo< GPUFieldType > >( bdId ); _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf ); _cpuCommunicationScheme->addPackInfo( _gpuPackInfo ); break; case UniformGPUScheme_Baseline: - _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); + _gpuCommunicationScheme = make_shared< gpu::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); _gpuCommunicationScheme->addPackInfo( generatedPackInfo ); break; case UniformGPUScheme_Memcpy: - _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); + _gpuCommunicationScheme = make_shared< gpu::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); _gpuCommunicationScheme->addPackInfo( memcpyPackInfo ); break; case MPIDatatypes: @@ -151,7 +151,7 @@ public: private: CommunicationSchemeType _commSchemeType; shared_ptr< blockforest::communication::UniformBufferedScheme< StencilType > > _cpuCommunicationScheme; - shared_ptr< cuda::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo; - shared_ptr< cuda::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme; + shared_ptr< gpu::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo; + shared_ptr< gpu::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme; shared_ptr< blockforest::communication::UniformDirectScheme<StencilType> > _directScheme; }; diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index 50d9bfd756b4bc5d463ac9848e084945e32da1bd..531ab22d54ab261ad8f159c91e85c5bfde03360d 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -1,12 +1,3 @@ -#!/usr/bin/env python3 -""" -This is a 
waLBerla parameter file that tests (almost) all parameter combinations for GPU communication. -Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1 then run e.g. - ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py - -Look at the end of the file to select the benchmark to run -""" - import os import waLBerla as wlb from waLBerla.tools.config import block_decomposition @@ -34,6 +25,15 @@ BASE_CONFIG = { } } +ldc_setup = {'Border': [ + {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, +]} + def num_time_steps(block_size, time_steps_for_128_block=200): cells = block_size[0] * block_size[1] * block_size[2] @@ -57,10 +57,16 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8): class Scenario: def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1), timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False, - inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False, + inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, + init_shear_flow=False, boundary_setup=False, + vtk_write_frequency=0, remaining_time_logger_frequency=-1, additional_info=None): - self.blocks = block_decomposition(wlb.mpi.numProcesses()) + if boundary_setup: + init_shear_flow = False + periodic = (0, 0, 0) + + self.blocks = (2, 1, 1) # block_decomposition(wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -71,11 +77,13 @@ class Scenario: self.cuda_enabled_mpi = cuda_enabled_mpi self.inner_outer_split = inner_outer_split self.init_shear_flow = init_shear_flow + self.boundary_setup = boundary_setup self.warmup_steps = 
warmup_steps self.outer_iterations = outer_iterations self.cuda_blocks = cuda_blocks - self.vtk_write_frequency = 0 + self.vtk_write_frequency = vtk_write_frequency + self.remaining_time_logger_frequency = remaining_time_logger_frequency self.config_dict = self.config(print_dict=False) self.additional_info = additional_info @@ -88,6 +96,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, + 'oneBlockPerProcess': False }, 'Parameters': { 'omega': self.omega, @@ -99,9 +108,13 @@ class Scenario: 'initShearFlow': self.init_shear_flow, 'gpuBlockSize': self.cuda_blocks, 'innerOuterSplit': self.inner_outer_split, - 'vtkWriteFrequency': self.vtk_write_frequency + 'vtkWriteFrequency': self.vtk_write_frequency, + 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency } } + if self.boundary_setup: + config_dict["Boundaries"] = ldc_setup + if print_dict: wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) if self.additional_info: @@ -129,7 +142,7 @@ class Scenario: num_tries = 4 # check multiple times e.g. 
may fail when multiple benchmark processes are running table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}" - table_name = table_name.replace("-", "_") + table_name = table_name.replace("-", "_") # - not allowed for table name would lead to syntax error for num_try in range(num_tries): try: checkAndUpdateSchema(result, table_name, DB_FILE) @@ -219,90 +232,30 @@ def single_gpu_benchmark(): scenarios.add(scenario) -# -------------------------------------- Optional job script generation for PizDaint --------------------------------- - - -job_script_header = """ -#!/bin/bash -l -#SBATCH --job-name=scaling -#SBATCH --time=01:00:00 -#SBATCH --nodes={nodes} -#SBATCH -o out_scaling_{nodes}_%j.txt -#SBATCH -e err_scaling_{nodes}_%j.txt -#SBATCH --ntasks-per-core=1 -#SBATCH --cpus-per-task=1 -#SBATCH --partition=normal -#SBATCH --constraint=gpu -#SBATCH --account=s1042 - -source ~/env.sh - -export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer -export CRAY_CUDA_MPS=1 # allow GPU sharing -export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages - -export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -export CRAY_CUDA_MPS=1 - -export MPICH_RANK_REORDER_METHOD=3 -export PMI_MMAP_SYNC_WAIT_TIME=300 - -cd {folder} -# grid_order -R -H -c 1,1,8 -g 16,16,8 - -ulimit -c 0 -""" - -job_script_exe_part = """ - -export WALBERLA_SCENARIO_IDX=0 -while srun -n {nodes} ./{app} {config} -do - ((WALBERLA_SCENARIO_IDX++)) -done -""" - -streaming_patterns = ['pull', 'push', 'aa', 'esotwist'] -stencils = ['d3q27', 'd3q19'] -methods = ['srt', 'mrt', 'cumulant', 'entropic'] - -all_executables = [] - -for stencil in stencils: - for streaming_pattern in streaming_patterns: - for method in methods: - all_executables.append(f"UniformGridGPU_{stencil}_{streaming_pattern}_{method}") - -all_executables = tuple(all_executables) - - -def generate_jobscripts(exe_names=all_executables): - for node_count in [1, 2, 4, 8, 16, 
32, 64, 128, 256, 512, 1024, 2048, 2400]: - folder_name = "scaling_{:04d}".format(node_count) - os.makedirs(folder_name, exist_ok=True) - - # run grid_order - import subprocess - decomposition = block_decomposition(node_count) - decomposition_str = ",".join(str(e) for e in decomposition) - subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str]) - - job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name)) - for exe in exe_names: - job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, - config='../communication_compare.py') - - with open(os.path.join(folder_name, 'job.sh'), 'w') as f: - f.write(job_script) +def validation_run(): + """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + wlb.log_info_on_root("") + time_step_strategy = "noOverlap" # "noOverlap" -if __name__ == '__main__': - print("Called without waLBerla - generating job scripts for PizDaint") - generate_jobscripts() -else: - wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") - # Select the benchmark you want to run - single_gpu_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU - # performance of compute kernel (no communication) - # overlap_benchmark() # benchmarks different communication overlap options - # profiling() # run only two timesteps on a smaller domain for profiling only + scenarios = wlb.ScenarioManager() + scenario = Scenario(cells_per_block=(64, 64, 64), + time_step_strategy=time_step_strategy, + timesteps=1000, + outer_iterations=1, + warmup_steps=0, + init_shear_flow=False, + boundary_setup=True, + vtk_write_frequency=0, + remaining_time_logger_frequency=10) + scenarios.add(scenario) + + +wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") +# Select the benchmark you want to run +# single_gpu_benchmark() # 
benchmarks different CUDA block sizes and domain sizes and measures single GPU +# performance of compute kernel (no communication) +# overlap_benchmark() # benchmarks different communication overlap options +# profiling() # run only two timesteps on a smaller domain for profiling only +validation_run() diff --git a/apps/pythonmodule/CMakeLists.txt b/apps/pythonmodule/CMakeLists.txt index 5ea0decea67007ad01fb8e155a8667fe290dd64f..d2c6251c1e8a9efb7d7918cf620d1f81f01a1769 100644 --- a/apps/pythonmodule/CMakeLists.txt +++ b/apps/pythonmodule/CMakeLists.txt @@ -3,8 +3,8 @@ if( NOT TARGET python_coupling ) message( WARNING "python module ist not build since the python_coupling target is non-existent" ) else() if ( WALBERLA_BUILD_WITH_PYTHON ) - if ( WALBERLA_BUILD_WITH_CUDA ) - set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk cuda) + if ( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk gpu) else() set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk) endif() diff --git a/apps/pythonmodule/PythonModule.cpp b/apps/pythonmodule/PythonModule.cpp index 3059e3f059e2fa110dda1681859693cb6616fc44..11fada5098e505528b4698f33f572a2f29ab586d 100644 --- a/apps/pythonmodule/PythonModule.cpp +++ b/apps/pythonmodule/PythonModule.cpp @@ -28,8 +28,8 @@ #include "stencil/all.h" -#ifdef WALBERLA_BUILD_WITH_CUDA - #include "python_coupling/export/CUDAExport.h" +#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT + #include "python_coupling/export/GPUExport.h" #endif @@ -75,11 +75,11 @@ struct InitObject pythonManager->addExporterFunction(blockforest::exportModuleToPython<stencil::D2Q5, stencil::D2Q9, stencil::D3Q7, stencil::D3Q19, stencil::D3Q27>); // VTK pythonManager->addExporterFunction( vtk::exportModuleToPython ); - #ifdef WALBERLA_BUILD_WITH_CUDA - using 
walberla::cuda::GPUField; + #ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT + using walberla::gpu::GPUField; - pythonManager->addExporterFunction( cuda::exportModuleToPython<GPU_FIELD_TYPES> ); - pythonManager->addExporterFunction( cuda::exportCopyFunctionsToPython<FIELD_TYPES> ); + pythonManager->addExporterFunction(gpu::exportModuleToPython<GPU_FIELD_TYPES> ); + pythonManager->addExporterFunction(gpu::exportCopyFunctionsToPython<FIELD_TYPES> ); pythonManager->addBlockDataConversion<GPU_FIELD_TYPES>(); #endif // diff --git a/apps/showcases/Antidunes/Antidunes.cpp b/apps/showcases/Antidunes/Antidunes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48113869c8672be163c2c7c6e377d4b5b45f17a3 --- /dev/null +++ b/apps/showcases/Antidunes/Antidunes.cpp @@ -0,0 +1,1590 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AntiDunes.cpp +//! \author Samuel Kemmler <samuel.kemmler@fau.de> +//! \author Jonas Plewinski <jonas.plewinski@fau.de> +//! \author Christoph Rettinger <christoph.rettinger@fau.de> +//! \author Christoph Schwarzmeier <christoph.schwarzmeier@fau.de> +// +// This showcase simulates antidunes, i.e., particulate dunes that travel in opposite stream-wise direction. 
See +// simulation results published in https://doi.org/10.1017/jfm.2023.262. +//====================================================================================================================== + +#include "blockforest/Initialization.h" + +#include "core/Environment.h" +#include "core/SharedFunctor.h" +#include "core/timing/RemainingTimeLogger.h" + +#include "domain_decomposition/SharedSweep.h" + +#include "field/StabilityChecker.h" + +#include "lbm/PerformanceLogger.h" +#include "lbm/field/AddToStorage.h" +#include "lbm/free_surface/BlockStateDetectorSweep.h" +#include "lbm/free_surface/SurfaceMeshWriter.h" +#include "lbm/free_surface/TotalMassComputer.h" +#include "lbm/free_surface/VtkWriter.h" +#include "lbm/free_surface/dynamics/CellConversionSweep.h" +#include "lbm/free_surface/dynamics/ConversionFlagsResetSweep.h" +#include "lbm/free_surface/dynamics/ExcessMassDistributionSweep.h" +#include "lbm/free_surface/dynamics/PdfRefillingSweep.h" +#include "lbm/free_surface/dynamics/StreamReconstructAdvectSweep.h" +#include "lbm/free_surface/surface_geometry/CurvatureSweep.h" +#include "lbm/free_surface/surface_geometry/NormalSweep.h" +#include "lbm/free_surface/surface_geometry/SmoothingSweep.h" +#include "lbm/free_surface/surface_geometry/Utility.h" + +#include "lbm_mesapd_coupling/mapping/ParticleMapping.h" +#include "lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.h" +#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/PdfReconstructionManager.h" +#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/Reconstructor.h" +#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h" +#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h" +#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h" +#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h" +#include "lbm_mesapd_coupling/utility/ParticleSelector.h" +#include 
"lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h" + +#include "mesa_pd/collision_detection/AnalyticContactDetection.h" +#include "mesa_pd/data/ParticleAccessorWithBaseShape.h" +#include "mesa_pd/data/ParticleStorage.h" +#include "mesa_pd/data/shape/Sphere.h" +#include "mesa_pd/domain/BlockForestDataHandling.h" +#include "mesa_pd/domain/BlockForestDomain.h" +#include "mesa_pd/kernel/DoubleCast.h" +#include "mesa_pd/kernel/LinearSpringDashpot.h" +#include "mesa_pd/kernel/ParticleSelector.h" +#include "mesa_pd/kernel/VelocityVerlet.h" +#include "mesa_pd/mpi/ClearGhostOwnerSync.h" +#include "mesa_pd/mpi/ClearNextNeighborSync.h" +#include "mesa_pd/mpi/ContactFilter.h" +#include "mesa_pd/mpi/ReduceContactHistory.h" +#include "mesa_pd/mpi/ReduceProperty.h" +#include "mesa_pd/mpi/SyncGhostOwners.h" +#include "mesa_pd/mpi/SyncNextNeighbors.h" +#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h" +#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h" +#include "mesa_pd/vtk/ParticleVtkOutput.h" + +#include "vtk/VTKOutput.h" + +#include <core/waLBerlaBuildInfo.h> + +#include "AntidunesBoundaryHandling.h" +#include "AntidunesLatticeModel.h" +#include "PIDController.h" +#include "Utility.h" + +namespace walberla +{ +namespace antidunes +{ +using ScalarField_T = GhostLayerField< real_t, 1 >; +using VectorField_T = GhostLayerField< Vector3< real_t >, 1 >; +using VectorFieldFlattened_T = GhostLayerField< real_t, 3 >; + +using LatticeModel_T = lbm::AntidunesLatticeModel; +using LatticeModelStencil_T = LatticeModel_T::Stencil; +using PdfField_T = lbm::PdfField< LatticeModel_T >; +using PdfCommunication_T = blockforest::SimpleCommunication< LatticeModelStencil_T >; + +// the geometry computations in SurfaceGeometryHandler require meaningful values in the ghost layers in corner +// directions (flag field and fill level field); this holds, even if the lattice model uses a D3Q19 stencil +using CommunicationStencil_T = + typename 
std::conditional< LatticeModel_T::Stencil::D == uint_t(2), stencil::D2Q9, stencil::D3Q27 >::type; +using Communication_T = blockforest::SimpleCommunication< CommunicationStencil_T >; + +using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithBaseShape; + +using flag_t = uint32_t; +using FlagField_T = FlagField< flag_t >; +using AntidunesBoundaryHandling_T = + free_surface::AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >; + +using StateSweep = walberla::free_surface::BlockStateDetectorSweep< FlagField_T >; + +const FlagUID FormerMO_Flag("former moving obstacle"); + +// empty sweep required for using selectors (e.g. StateSweep::fullFreeSurface) +struct emptySweep +{ + void operator()(IBlock*) {} +}; + +// data handling for loading a field of type ScalarField_T from file +template< typename ScalarField_T > +class ScalarFieldHandling : public field::BlockDataHandling< ScalarField_T > +{ + public: + ScalarFieldHandling(const weak_ptr< StructuredBlockStorage >& blocks, uint_t numberGhostLayer) + : blocks_(blocks), numberGhostLayer_(numberGhostLayer) + {} + + protected: + ScalarField_T* allocate(IBlock* const block) override { return allocateDispatch(block); } + + ScalarField_T* reallocate(IBlock* const block) override { return allocateDispatch(block); } + + private: + weak_ptr< StructuredBlockStorage > blocks_; + uint_t numberGhostLayer_; + + ScalarField_T* allocateDispatch(IBlock* const block) + { + WALBERLA_ASSERT_NOT_NULLPTR(block); + + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks); + + return new ScalarField_T(blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block), + blocks->getNumberOfZCells(*block), numberGhostLayer_, real_c(0), field::fzyx); + } +}; // class ScalarFieldHandling + +// data handling for loading a field of type VectorFieldFlattened_T from file +template< typename VectorFieldFlattened_T > +class VectorFieldFlattenedHandling : public field::BlockDataHandling< 
VectorFieldFlattened_T > +{ + public: + VectorFieldFlattenedHandling(const weak_ptr< StructuredBlockStorage >& blocks, uint_t numberGhostLayer) + : blocks_(blocks), numberGhostLayer_(numberGhostLayer) + {} + + protected: + VectorFieldFlattened_T* allocate(IBlock* const block) override { return allocateDispatch(block); } + + VectorFieldFlattened_T* reallocate(IBlock* const block) override { return allocateDispatch(block); } + + private: + weak_ptr< StructuredBlockStorage > blocks_; + uint_t numberGhostLayer_; + + VectorFieldFlattened_T* allocateDispatch(IBlock* const block) + { + WALBERLA_ASSERT_NOT_NULLPTR(block); + + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks); + + return new VectorFieldFlattened_T(blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block), + blocks->getNumberOfZCells(*block), numberGhostLayer_, field::fzyx); + } +}; // class VectorFieldFlattenedHandling + +// sweep for computing the force density from the fluid density and fill level +template< typename LatticeModel_T, typename FlagField_T, typename VectorFieldFlattened_T, typename ScalarField_T > +class ForceDensityCodegenSweep +{ + public: + ForceDensityCodegenSweep(BlockDataID forceDensityFieldID, ConstBlockDataID pdfFieldID, ConstBlockDataID flagFieldID, + ConstBlockDataID fillFieldID, + const walberla::free_surface::FlagInfo< FlagField_T >& flagInfo, + std::shared_ptr< Vector3< real_t > > globalAcceleration) + : forceDensityFieldID_(forceDensityFieldID), pdfFieldID_(pdfFieldID), flagFieldID_(flagFieldID), + fillFieldID_(fillFieldID), flagInfo_(flagInfo), globalAcceleration_(globalAcceleration) + {} + + void operator()(IBlock* const block) + { + VectorFieldFlattened_T* const forceDensityField = block->getData< VectorFieldFlattened_T >(forceDensityFieldID_); + const PdfField_T* const pdfField = block->getData< const PdfField_T >(pdfFieldID_); + const FlagField_T* const flagField = block->getData< const FlagField_T >(flagFieldID_); + const ScalarField_T* const 
fillField = block->getData< const ScalarField_T >(fillFieldID_); + + WALBERLA_FOR_ALL_CELLS(forceDensityFieldIt, forceDensityField, pdfFieldIt, pdfField, flagFieldIt, flagField, + fillFieldIt, fillField, { + flag_t flag = *flagFieldIt; + + // set force density in cells to acceleration * density * fillLevel (see equation 15 + // in Koerner et al., 2005); + if (flagInfo_.isInterface(flag)) + { + const real_t density = pdfField->getDensity(pdfFieldIt.cell()); + forceDensityFieldIt[0] = (*globalAcceleration_)[0] * *fillFieldIt * density; + forceDensityFieldIt[1] = (*globalAcceleration_)[1] * *fillFieldIt * density; + forceDensityFieldIt[2] = (*globalAcceleration_)[2] * *fillFieldIt * density; + } + else + { + if (flagInfo_.isLiquid(flag)) + { + const real_t density = pdfField->getDensity(pdfFieldIt.cell()); + forceDensityFieldIt[0] = (*globalAcceleration_)[0] * density; + forceDensityFieldIt[1] = (*globalAcceleration_)[1] * density; + forceDensityFieldIt[2] = (*globalAcceleration_)[2] * density; + } + } + }) // WALBERLA_FOR_ALL_CELLS + } + + private: + using flag_t = typename FlagField_T::flag_t; + + BlockDataID forceDensityFieldID_; + ConstBlockDataID pdfFieldID_; + ConstBlockDataID flagFieldID_; + ConstBlockDataID fillFieldID_; + walberla::free_surface::FlagInfo< FlagField_T > flagInfo_; + std::shared_ptr< Vector3< real_t > > globalAcceleration_; +}; // class ForceDensitySweep + +// function describing the global initialization profile +inline real_t initializationProfile(real_t x, real_t amplitude, real_t offset, real_t wavelength) +{ + return amplitude * std::cos(x / wavelength * real_c(2) * math::pi + math::pi) + offset; +} + +real_t getHydrostaticDensity(real_t height, real_t referenceHeight, real_t gravitationalAcceleration) +{ + return real_c(1) + real_c(3) * gravitationalAcceleration * (height - referenceHeight); +} + +void initializePoiseuilleProfile(StructuredBlockForest& forest, const BlockDataID& pdfFieldID, + const ConstBlockDataID& fillFieldID, const 
real_t& averageBedHeight, + const real_t& averageFluidHeight, const real_t& accelerationX, const real_t& viscosity, + real_t amplitude, real_t wavelength) +{ + WALBERLA_LOG_INFO_ON_ROOT("Initializing Poiseuille velocity profile"); + + const real_t rho = real_c(1); + + for (auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt) + { + PdfField_T* const pdfField = blockIt->getData< PdfField_T >(pdfFieldID); + const ScalarField_T* const fillField = blockIt->getData< const ScalarField_T >(fillFieldID); + + WALBERLA_FOR_ALL_CELLS_XYZ( + pdfField, const Vector3< real_t > coord = forest.getBlockLocalCellCenter(*blockIt, Cell(x, y, z)); + + Vector3< real_t > velocity(real_c(0)); + + auto localBedHeight = initializationProfile(coord[0], amplitude, averageBedHeight, wavelength); + auto heightAboveBed = coord[2] - localBedHeight; + + const real_t fillLevel = fillField->get(x, y, z); + + if (heightAboveBed >= real_c(0) && fillLevel > real_c(0)) { + velocity[0] = accelerationX / (real_c(2) * viscosity) * heightAboveBed * + (real_c(2) * averageFluidHeight - heightAboveBed); + } pdfField->setToEquilibrium(x, y, z, velocity, rho);) + } +} + +/*********************************************************************************************************************** + * Initialize the hydrostatic pressure in the direction in which a force is acting in ALL cells (regardless of a cell's + * flag). The velocity remains unchanged. + * + * The force vector must have only one component, i.e., the direction of the force can only be in x-, y- or z-axis. + * The variable fluidHeight determines the height at which the density is equal to reference density (=1). 
+ **********************************************************************************************************************/ +template< typename PdfField_T > +void initHydrostaticPressure(const std::weak_ptr< StructuredBlockForest >& blockForestPtr, + const BlockDataID& pdfFieldID, + std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct) +{ + WALBERLA_LOG_INFO_ON_ROOT("Initializing hydrostatic pressure"); + + const auto blockForest = blockForestPtr.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blockForest); + + for (auto blockIt = blockForest->begin(); blockIt != blockForest->end(); ++blockIt) + { + PdfField_T* const pdfField = blockIt->getData< PdfField_T >(pdfFieldID); + + CellInterval local = pdfField->xyzSizeWithGhostLayer(); // block-, i.e., process-local cell interval + + for (auto cellIt = local.begin(); cellIt != local.end(); ++cellIt) + { + // initialize the (hydrostatic) pressure, i.e., LBM density + // Bernoulli: p = p0 + density * gravity * height + // => LBM (density=1): rho = rho0 + gravity * height = 1 + 1/cs^2 * g * h = 1 + 3 * g * h + // shift global cell by 0.5 since density is set for cell center + + Vector3< real_t > cellCenter = blockForest->getBlockLocalCellCenter(*blockIt, *cellIt); + const real_t rho = hydrostaticDensityFct(cellCenter); + + const Vector3< real_t > velocity = pdfField->getVelocity(*cellIt); + + pdfField->setDensityAndVelocity(*cellIt, velocity, rho); + } + } +} + +template< typename FreeSurfaceBoundaryHandling_T, typename PdfField_T, typename FlagField_T > +class MeanVelocityComputer +{ + public: + MeanVelocityComputer(const std::weak_ptr< const StructuredBlockForest >& blockForest, + const std::weak_ptr< const FreeSurfaceBoundaryHandling_T >& freeSurfaceBoundaryHandling, + const ConstBlockDataID& pdfFieldID, const std::shared_ptr< Vector3< real_t > >& meanVelocity, + real_t averagingFactor) + : blockForest_(blockForest), freeSurfaceBoundaryHandling_(freeSurfaceBoundaryHandling), pdfFieldID_(pdfFieldID), + 
meanVelocity_(meanVelocity), averagingFactor_(averagingFactor) + {} + + void operator()() + { + auto blockForest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blockForest); + + auto freeSurfaceBoundaryHandling = freeSurfaceBoundaryHandling_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(freeSurfaceBoundaryHandling); + + getMeanVelocity(blockForest, freeSurfaceBoundaryHandling); + } + + void getMeanVelocity(const std::shared_ptr< const StructuredBlockForest >& blockForest, + const std::shared_ptr< const FreeSurfaceBoundaryHandling_T >& freeSurfaceBoundaryHandling) + { + const BlockDataID flagFieldID = freeSurfaceBoundaryHandling->getFlagFieldID(); + const typename FreeSurfaceBoundaryHandling_T::FlagInfo_T& flagInfo = freeSurfaceBoundaryHandling->getFlagInfo(); + + // use separate variables for the velocity in each direction; syntax meanVelocity[0] does not work in OMP-macro + real_t meanVelocityX = real_c(0); + real_t meanVelocityY = real_c(0); + real_t meanVelocityZ = real_c(0); + + for (auto blockIt = blockForest->begin(); blockIt != blockForest->end(); ++blockIt) + { + const FlagField_T* const flagField = blockIt->template getData< const FlagField_T >(flagFieldID); + const PdfField_T* const pdfField = blockIt->template getData< const PdfField_T >(pdfFieldID_); + + WALBERLA_FOR_ALL_CELLS_OMP(flagFieldIt, flagField, pdfFieldIt, pdfField, + omp parallel for schedule(static) reduction(+:meanVelocityX) + reduction(+:meanVelocityY) reduction(+:meanVelocityZ), + { + if (flagInfo.isLiquid(flagFieldIt) || flagInfo.isInterface(flagFieldIt)) + { + const Vector3< real_t > velocity = pdfField->getVelocity(pdfFieldIt.cell()); + + meanVelocityX += velocity[0]; + meanVelocityY += velocity[1]; + meanVelocityZ += velocity[2]; + + //++cellCount; + } + }) // WALBERLA_FOR_ALL_CELLS_OMP + } + + Vector3< real_t > meanVelocity(meanVelocityX, meanVelocityY, meanVelocityZ); + mpi::allReduceInplace< real_t >(meanVelocity, mpi::SUM); + // mpi::allReduceInplace< uint_t >(cellCount, mpi::SUM); + 
+ meanVelocity *= averagingFactor_; + *meanVelocity_ = meanVelocity; + }; + + private: + std::weak_ptr< const StructuredBlockForest > blockForest_; + std::weak_ptr< const FreeSurfaceBoundaryHandling_T > freeSurfaceBoundaryHandling_; + + const ConstBlockDataID pdfFieldID_; + + std::shared_ptr< Vector3< real_t > > meanVelocity_; + real_t averagingFactor_; +}; // class MeanVelocityComputer + +class ForcingAdjuster +{ + public: + ForcingAdjuster(const shared_ptr< StructuredBlockStorage >& blocks, real_t targetVelocity, real_t externalForcing, + real_t proportionalGain, real_t derivativeGain, real_t integralGain, real_t maxRamp, + real_t minActuatingVariable, real_t maxActuatingVariable) + : blocks_(blocks), currentExternalForcing_(externalForcing), + pid_(targetVelocity, externalForcing, proportionalGain, derivativeGain, integralGain, maxRamp, + minActuatingVariable, maxActuatingVariable) + { + WALBERLA_LOG_INFO_ON_ROOT("Creating PID controller with pg = " << pid_.getProportionalGain() + << ", dg = " << pid_.getDerivateGain() + << ", ig = " << pid_.getIntegralGain()); + } + + void operator()(const real_t currentMeanVelocity) + { + // compute new forcing value on root (since flow rate only known on root) + WALBERLA_ROOT_SECTION() + { + real_t newExternalForcing = pid_.update(currentMeanVelocity); + currentExternalForcing_ = newExternalForcing; + } + + // send updated external forcing to all other processes + mpi::broadcastObject(currentExternalForcing_); + } + + real_t getExternalForcing() { return currentExternalForcing_; } + void storePIDSnapshot(std::string filename) + { + WALBERLA_ROOT_SECTION() { pid_.writeStateToFile(filename); } + } + void loadPIDSnapshot(std::string filename) { pid_.readStateFromFile(filename); } + + private: + shared_ptr< StructuredBlockStorage > blocks_; + + real_t currentExternalForcing_; + PIDController pid_; +}; // ForcingAdjuster + +int main(int argc, char** argv) +{ + Environment walberlaEnv(argc, argv); + + 
WALBERLA_LOG_INFO_ON_ROOT("waLBerla Revision: " << std::string(WALBERLA_GIT_SHA1).substr(0, 8)) + + if (argc < 2) { WALBERLA_ABORT("Please specify a parameter file as input argument.") } + + WALBERLA_LOG_DEVEL_ON_ROOT("Using generated lattice model."); + auto configPtr = walberlaEnv.config(); + + // print content of parameter file + WALBERLA_LOG_INFO_ON_ROOT(*configPtr); + + WALBERLA_ROOT_SECTION() + { + std::ofstream file; + file.open("parameterConfiguration.cfg"); + file << *configPtr; + file.close(); + } + + // get block forest parameters from parameter file + auto blockForestParameters = configPtr->getOneBlock("BlockForestParameters"); + const Vector3< uint_t > cellsPerBlock = blockForestParameters.getParameter< Vector3< uint_t > >("cellsPerBlock"); + const Vector3< bool > periodicity = blockForestParameters.getParameter< Vector3< bool > >("periodicity"); + const bool loadSnapshot = blockForestParameters.getParameter< bool >("loadSnapshot"); + const bool storeSnapshot = blockForestParameters.getParameter< bool >("storeSnapshot"); + const uint_t snapshotFrequency = blockForestParameters.getParameter< uint_t >("snapshotFrequency"); + const std::string snapshotBaseFolder = blockForestParameters.getParameter< std::string >("snapshotBaseFolder"); + + // get domain parameters from parameter file + auto domainParameters = configPtr->getOneBlock("DomainParameters"); + const Vector3< uint_t > domainSize = domainParameters.getParameter< Vector3< uint_t > >("domainSize"); + const uint_t wavePeriods = domainParameters.getParameter< uint_t >("wavePeriods"); + const real_t liquidHeightFactor = domainParameters.getParameter< real_t >("liquidHeightFactor"); + const real_t floorHeightFactor = domainParameters.getParameter< real_t >("floorHeightFactor"); + const real_t initialAmplitude = domainParameters.getParameter< real_t >("initialAmplitude"); + + // compute number of blocks as defined by domainSize and cellsPerBlock + Vector3< uint_t > numBlocks; + for (uint_t i = 
uint_c(0); i <= uint_c(2); ++i) + { + numBlocks[i] = domainSize[i] / cellsPerBlock[i]; + WALBERLA_CHECK_EQUAL(numBlocks[i] * cellsPerBlock[i], domainSize[i], + "Domain size in direction " << i << " is not a multiple of cells per block.") + } + + // get number of (MPI) processes + const uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses()); + WALBERLA_CHECK_EQUAL(numProcesses, numBlocks[0] * numBlocks[1] * numBlocks[2], + "The number of MPI processes is different from the number of blocks as defined by " + "\"domainSize/cellsPerBlock\".") + + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(numProcesses); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(numBlocks); + + // get PID controller parameters + auto PIDParameters = configPtr->getOneBlock("PIDParameters"); + const real_t targetMeanVelocityMagnitude = PIDParameters.getParameter< real_t >("targetMeanVelocityMagnitude"); + const real_t proportionalGain = PIDParameters.getParameter< real_t >("proportionalGain"); + const real_t derivativeGain = PIDParameters.getParameter< real_t >("derivativeGain"); + const real_t integralGain = PIDParameters.getParameter< real_t >("integralGain"); + const real_t maxRamp = PIDParameters.getParameter< real_t >("maxRamp"); + const real_t minActuatingVariable = PIDParameters.getParameter< real_t >("minActuatingVariable"); + const real_t maxActuatingVariable = PIDParameters.getParameter< real_t >("maxActuatingVariable"); + + // read particle infos + const auto particleParameters = configPtr->getOneBlock("ParticleParameters"); + const std::string particleInFileName = particleParameters.getParameter< std::string >("inFileName"); + const uint_t bedCopiesInX = particleParameters.getParameter< uint_t >("bedCopiesInX"); + const uint_t bedCopiesInY = particleParameters.getParameter< uint_t >("bedCopiesInY"); + const real_t particleDensityRatio = particleParameters.getParameter< real_t >("densityRatio"); + const real_t particleFixingHeightFactor = particleParameters.getParameter< real_t 
>("fixingHeightFactor"); + const real_t particleFrictionCoefficient = particleParameters.getParameter< real_t >("frictionCoefficient"); + const real_t particleRestitutionCoefficient = particleParameters.getParameter< real_t >("restitutionCoefficient"); + const uint_t particleNumSubCycles = particleParameters.getParameter< uint_t >("numSubCycles"); + const bool useLubricationCorrection = particleParameters.getParameter< bool >("useLubricationCorrection"); + const bool useNoSlipParticles = particleParameters.getParameter< bool >("useNoSlipParticles"); + const real_t particlePoissonsRatio = real_c(0.22); + const real_t particleKappa = real_c(2) * (real_c(1) - particlePoissonsRatio) / (real_c(2) - particlePoissonsRatio); + real_t particleCollisionTimeNonDim = real_c(4); + bool useOpenMP = false; + const uint_t vtkSpacingParticles = + configPtr->getOneBlock("VTK").getOneBlock("fluid_field").getParameter< uint_t >("writeFrequency"); + const std::string vtkFolder = + configPtr->getOneBlock("VTK").getOneBlock("fluid_field").getParameter< std::string >("baseFolder"); + + // get physics parameters from parameter file + auto physicsParameters = configPtr->getOneBlock("PhysicsParameters"); + const bool enableWetting = physicsParameters.getParameter< bool >("enableWetting"); + const uint_t timesteps = physicsParameters.getParameter< uint_t >("timesteps"); + const real_t Re = physicsParameters.getParameter< real_t >("Re"); + const real_t Fr = physicsParameters.getParameter< real_t >("Fr"); + const real_t We = physicsParameters.getParameter< real_t >("We"); + + // get avgDiameter and scaling factor + real_t avgParticleDiameter = real_c(0); + real_t particleScalingFactor = real_c(0); + getAvgDiameterScalingFactor(particleInFileName, domainSize, bedCopiesInX, bedCopiesInY, avgParticleDiameter, + particleScalingFactor); + const real_t liquidHeight = avgParticleDiameter * liquidHeightFactor; + const real_t floorHeight = avgParticleDiameter * floorHeightFactor; + const real_t 
particleFixingHeight = particleFixingHeightFactor * avgParticleDiameter; + + WALBERLA_CHECK_FLOAT_UNEQUAL(liquidHeight, 0.0) + WALBERLA_CHECK_FLOAT_UNEQUAL(floorHeight, 0.0) + + const real_t absoluteLiquidHeight = liquidHeight + floorHeight; + + const real_t viscosity = targetMeanVelocityMagnitude * liquidHeight / Re; + const real_t relaxationRate = real_c(1.0) / (real_c(3) * viscosity + real_c(0.5)); + + const real_t gravity = (targetMeanVelocityMagnitude / Fr) * (targetMeanVelocityMagnitude / Fr) / liquidHeight; + const real_t accelerationX = real_c(3) * targetMeanVelocityMagnitude * viscosity / (liquidHeight * liquidHeight); + std::shared_ptr< Vector3< real_t > > acceleration = + std::make_shared< Vector3< real_t > >(accelerationX, real_c(0.0), -gravity); + + const real_t surfaceTension = + real_c(1.0) * targetMeanVelocityMagnitude * targetMeanVelocityMagnitude * liquidHeight / We; + + // compute SI dx and dt + const real_t viscosity_SI = real_c(1.0016e-6); // kinemtic viscosity of water at 20°C at 1 bar + const real_t dx_SI = real_c(1) / particleScalingFactor; + const real_t dt_SI = viscosity / viscosity_SI * dx_SI * dx_SI; + + WALBERLA_LOG_INFO_ON_ROOT("\nPhysical parameters:") + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(liquidHeight); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(floorHeight); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(dx_SI); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(dt_SI); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(*acceleration); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(relaxationRate); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(viscosity); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(absoluteLiquidHeight); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(avgParticleDiameter); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(particleScalingFactor); + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(particleFixingHeight); + WALBERLA_LOG_INFO_ON_ROOT("\nFree surface physical parameters") + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(surfaceTension); + + if ((periodicity[0] && numBlocks[0] < uint_c(3)) || (periodicity[1] && numBlocks[1] < uint_c(3)) || + (periodicity[2] && 
numBlocks[2] < uint_c(3))) + { + WALBERLA_ABORT("When using particles, use at least three blocks per periodic direction."); + } + + // read model parameters from parameter file + const auto modelParameters = configPtr->getOneBlock("ModelParameters"); + const std::string pdfReconstructionModel = modelParameters.getParameter< std::string >("pdfReconstructionModel"); + const std::string pdfRefillingModel = modelParameters.getParameter< std::string >("pdfRefillingModel"); + const std::string excessMassDistributionModel = + modelParameters.getParameter< std::string >("excessMassDistributionModel"); + const walberla::free_surface::ExcessMassDistributionModel excessMassModel(excessMassDistributionModel); + const std::string curvatureModel = modelParameters.getParameter< std::string >("curvatureModel"); + const bool useSimpleMassExchange = modelParameters.getParameter< bool >("useSimpleMassExchange"); + const real_t cellConversionThreshold = modelParameters.getParameter< real_t >("cellConversionThreshold"); + const real_t cellConversionForceThreshold = modelParameters.getParameter< real_t >("cellConversionForceThreshold"); + + // read evaluation parameters from parameter file + const auto evaluationParameters = configPtr->getOneBlock("EvaluationParameters"); + const uint_t performanceLogFrequency = evaluationParameters.getParameter< uint_t >("performanceLogFrequency"); + const uint_t evaluationFrequency = evaluationParameters.getParameter< uint_t >("evaluationFrequency"); + const std::string baseFolderName = evaluationParameters.getParameter< std::string >("baseFolderName"); + + uint_t beginTimeStep = 0; + const std::string checkpointConfigFile("antidunesCheckpointConfig.file"); + if (loadSnapshot) + { + WALBERLA_ROOT_SECTION() + { + std::ifstream file; + file.open(snapshotBaseFolder + "/" + checkpointConfigFile); + if (file.fail()) WALBERLA_ABORT("Error: " << checkpointConfigFile << " could not be opened!"); + file >> beginTimeStep; + file >> (*acceleration)[0]; + 
file.close(); + } + mpi::broadcastObject(beginTimeStep); + mpi::broadcastObject(*acceleration); + + WALBERLA_LOG_INFO_ON_ROOT("Successfully read config parameters from checkpoint config file:") + WALBERLA_LOG_INFO_ON_ROOT(" - beginTimeStep = " << beginTimeStep) + WALBERLA_LOG_INFO_ON_ROOT(" - acceleration = < " << (*acceleration)[0] << ", " << (*acceleration)[1] << ", " + << (*acceleration)[2] << " >") + } + + if (loadSnapshot) + { + // modify config file to start VTK output from "loadFromTimestep" rather than from 0 + std::vector< config::Config::Block* > configVTKBlock; + configPtr->getWritableGlobalBlock().getWritableBlocks("VTK", configVTKBlock, 1, 1); + std::vector< config::Config::Block* > configVTKFluidFieldBlock; + configVTKBlock[0]->getWritableBlocks("fluid_field", configVTKFluidFieldBlock, 1, 1); + configVTKFluidFieldBlock[0]->setOrAddParameter("initialExecutionCount", std::to_string(beginTimeStep)); + } + + WALBERLA_ROOT_SECTION() + { + // create base directories if they do not yet exist + filesystem::path tpath(baseFolderName); + if (!filesystem::exists(tpath)) filesystem::create_directory(tpath); + + filesystem::path snapshotPath(snapshotBaseFolder); + if (!filesystem::exists(snapshotPath)) filesystem::create_directory(snapshotPath); + } + + std::shared_ptr< StructuredBlockForest > blockForest(nullptr); + const std::string blockForestFile("blockForest.file"); + + if (loadSnapshot) + { + // load block forest from file + MPIManager::instance()->useWorldComm(); + + blockForest = make_shared< StructuredBlockForest >( + make_shared< BlockForest >(uint_c(MPIManager::instance()->rank()), + (std::string(snapshotBaseFolder + "/" + blockForestFile)).c_str(), true, false), + cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blockForest->createCellBoundingBoxes(); + } + else + { + // create uniform block forest + blockForest = blockforest::createUniformBlockGrid(numBlocks[0], numBlocks[1], numBlocks[2], // blocks + cellsPerBlock[0], cellsPerBlock[1], 
cellsPerBlock[2], // cells + real_c(1.0), // dx + true, // one block per process + periodicity[0], periodicity[1], periodicity[2]); // periodicity + } + + // save block forest to file (but do not overwrite existing file if snapshot is loaded) + if (storeSnapshot && !loadSnapshot) + { + blockForest->getBlockForest().saveToFile(snapshotBaseFolder + "/" + blockForestFile); + } + + const auto vtkParameters = configPtr->getOneBlock("VTK"); + const auto vtkFluidParameters = vtkParameters.getOneBlock("fluid_field"); + + BlockDataID pdfFieldID; + const std::string pdfFieldFile("pdfField.file"); + BlockDataID fillFieldID; + const std::string fillFieldFile("fillField.file"); + BlockDataID excessMassFieldID; + const std::string excessMassFieldFile("excessMassField.file"); + + // force density field must be added here to create lattice model + BlockDataID forceDensityFieldID = + field::addToStorage< VectorFieldFlattened_T >(blockForest, "Force field", real_c(0), field::fzyx, uint_c(1)); + + if (excessMassModel.isEvenlyAllInterfaceFallbackLiquidType()) + { + // add additional field for storing excess mass in liquid cells + excessMassFieldID = + field::addToStorage< ScalarField_T >(blockForest, "Excess mass", real_c(0), field::fzyx, uint_c(1)); + } + + LatticeModel_T latticeModel = LatticeModel_T(forceDensityFieldID, relaxationRate); + + if (loadSnapshot) + { + // load PDF field from file + shared_ptr< lbm::internal::PdfFieldHandling< LatticeModel_T > > pdfFieldDataHandling = + make_shared< lbm::internal::PdfFieldHandling< LatticeModel_T > >( + blockForest, latticeModel, false, Vector3< real_t >(real_c(0)), real_c(1), uint_c(1), field::fzyx); + pdfFieldID = (blockForest->getBlockStorage()) + .loadBlockData(snapshotBaseFolder + "/" + pdfFieldFile, pdfFieldDataHandling, "PDF field"); + + // load fill level field from file + std::shared_ptr< ScalarFieldHandling< ScalarField_T > > fillFieldDataHandling = + std::make_shared< ScalarFieldHandling< ScalarField_T > >(blockForest, 
uint_c(2)); + fillFieldID = + (blockForest->getBlockStorage()) + .loadBlockData(snapshotBaseFolder + "/" + fillFieldFile, fillFieldDataHandling, "Fill level field"); + + // load fill level field from file + std::shared_ptr< ScalarFieldHandling< ScalarField_T > > excessMassFieldDataHandling = + std::make_shared< ScalarFieldHandling< ScalarField_T > >(blockForest, uint_c(1)); + excessMassFieldID = (blockForest->getBlockStorage()) + .loadBlockData(snapshotBaseFolder + "/" + excessMassFieldFile, excessMassFieldDataHandling, + "Excess mass field"); + } + else + { + // add PDF field + pdfFieldID = + lbm::addPdfFieldToStorage(blockForest, "PDF field", latticeModel, + Vector3< real_t >(targetMeanVelocityMagnitude, 0, 0), real_c(1.0), field::fzyx); + + // add fill level field (initialized with 0, i.e., gas everywhere) + fillFieldID = + field::addToStorage< ScalarField_T >(blockForest, "Fill level field", real_c(0.0), field::fzyx, uint_c(2)); + + // add fill level field (initialized with 0, i.e., gas everywhere) + excessMassFieldID = + field::addToStorage< ScalarField_T >(blockForest, "Excess mass field", real_c(0.0), field::fzyx, uint_c(1)); + } + + // MesaPD data structures + auto particleStorage = std::make_shared< mesa_pd::data::ParticleStorage >(1); + auto particleAccessor = std::make_shared< mesa_pd::data::ParticleAccessorWithBaseShape >(particleStorage); + auto mesapdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blockForest->getBlockForestPointer()); + + BlockDataID particleStorageID; + const std::string particleStorageFile("particleStorageFile.file"); + if (loadSnapshot) + { + WALBERLA_LOG_INFO_ON_ROOT("Initializing particles from checkpointing file!"); + particleStorageID = blockForest->loadBlockData(snapshotBaseFolder + "/" + particleStorageFile, + mesa_pd::domain::createBlockForestDataHandling(particleStorage), + "Particle Storage"); + mesa_pd::mpi::ClearNextNeighborSync CNNS; + CNNS(*particleAccessor); + + mesa_pd::mpi::ClearGhostOwnerSync CGOS; 
+ CGOS(*particleAccessor); + } + else + { + particleStorageID = + blockForest->addBlockData(mesa_pd::domain::createBlockForestDataHandling(particleStorage), "Particle Storage"); + } + + BlockDataID particleFieldID = field::addToStorage< lbm_mesapd_coupling::ParticleField_T >( + blockForest, "Particle field", particleAccessor->getInvalidUid(), field::fzyx, uint_c(2)); + + auto densityReferenceHeight = absoluteLiquidHeight; + auto hydrostaticDensityFct = [acceleration, densityReferenceHeight](const Vector3< real_t >& position) { + uint_t forceComponent = uint_c(2); // gravity is here strictly only acting in z direction! + return getHydrostaticDensity(position[forceComponent], densityReferenceHeight, (*acceleration)[forceComponent]); + }; + + // add boundary handling + const std::shared_ptr< AntidunesBoundaryHandling_T > antidunesBoundaryHandling = + std::make_shared< AntidunesBoundaryHandling_T >(blockForest, pdfFieldID, fillFieldID, particleFieldID, + particleAccessor, hydrostaticDensityFct); + const BlockDataID flagFieldID = antidunesBoundaryHandling->getFlagFieldID(); + const typename AntidunesBoundaryHandling_T::FlagInfo_T& flagInfo = antidunesBoundaryHandling->getFlagInfo(); + + real_t sinusAmplitude = real_c(0.5) * initialAmplitude; + real_t sinusOffset = floorHeight; + real_t sinusWavelength = real_c(domainSize[0]) / real_c(wavePeriods); + + if (!loadSnapshot) + { + // samples used in the Monte-Carlo like estimation of the fill level + const uint_t fillLevelInitSamples = uint_c(100); // actually there will be 101 since 0 is also included + + const uint_t numTotalPoints = (fillLevelInitSamples + uint_c(1)) * (fillLevelInitSamples + uint_c(1)); + const real_t stepsize = real_c(1) / real_c(fillLevelInitSamples); + + // initialize free-surface sine profile + for (auto blockIt = blockForest->begin(); blockIt != blockForest->end(); ++blockIt) + { + ScalarField_T* const fillField = blockIt->getData< ScalarField_T >(fillFieldID); + + 
WALBERLA_FOR_ALL_CELLS(fillFieldIt, fillField, { + // cell in block-local coordinates + const Cell localCell = fillFieldIt.cell(); + + // get cell in global coordinates + Cell globalCell = localCell; + blockForest->transformBlockLocalToGlobalCell(globalCell, *blockIt, localCell); + + // Monte-Carlo like estimation of the fill level: + // create uniformly-distributed sample points in each cell and count the number of points below the sine + // profile; this fraction of points is used as the fill level to initialize the profile + uint_t numPointsBelow = uint_c(0); + + for (uint_t xSample = uint_c(0); xSample <= fillLevelInitSamples; ++xSample) + { + // Pascal et al. (2021) defined the amplitude to span from minimum peak to maximum peak; in + // initializationProfile(), the amplitude is defined to range from the average to the maximum peak + const real_t functionValue = + initializationProfile(real_c(globalCell[0]) + real_c(xSample) * stepsize, sinusAmplitude, + absoluteLiquidHeight + real_c(0.5), sinusWavelength); + + for (uint_t zSample = uint_c(0); zSample <= fillLevelInitSamples; ++zSample) + { + const real_t zPoint = real_c(globalCell[2]) + real_c(zSample) * stepsize; + // with operator <, a fill level of 1 can not be reached when the line is equal to the cell's top + // border; with operator <=, a fill level of 0 can not be reached when the line is equal to the cell's + // bottom border + if (zPoint < functionValue) { ++numPointsBelow; } + } + } + + // fill level is fraction of points below sine profile + fillField->get(localCell) = real_c(numPointsBelow) / real_c(numTotalPoints); + }) // WALBERLA_FOR_ALL_CELLS + } + + initializePoiseuilleProfile(*blockForest, pdfFieldID, fillFieldID, floorHeight, liquidHeight + real_c(0.5), + (*acceleration)[0], viscosity, sinusAmplitude, sinusWavelength); + } + + // initialize domain boundary conditions from config file + const auto boundaryParameters = configPtr->getOneBlock("BoundaryParameters"); + 
antidunesBoundaryHandling->initFromConfig(boundaryParameters); + + std::function< void(void) > syncCall; + auto simulationDomainAABB = blockForest->getDomain(); + + lbm_mesapd_coupling::ParticleMappingKernel< AntidunesBoundaryHandling_T::BoundaryHandling_T > particleMappingKernel( + blockForest, antidunesBoundaryHandling->getHandlingID()); + lbm_mesapd_coupling::MovingParticleMappingKernel< AntidunesBoundaryHandling_T::BoundaryHandling_T > + movingParticleMappingKernel(blockForest, antidunesBoundaryHandling->getHandlingID(), particleFieldID); + + uint_t numParticles = uint_c(0); + // initialize bottom solid sine profile + if (!loadSnapshot) + { + auto createParticleFct = [sinusAmplitude, sinusOffset, sinusWavelength](Vector3< real_t > pos) { + return pos[2] < initializationProfile(pos[0], sinusAmplitude, sinusOffset, sinusWavelength); + }; + + real_t maxParticleHeight = real_c(0); + initSpheresFromFile(particleInFileName, *particleStorage, *mesapdDomain, particleDensityRatio, domainSize, + createParticleFct, simulationDomainAABB, bedCopiesInX, bedCopiesInY, numParticles, + maxParticleHeight, particleScalingFactor); + WALBERLA_LOG_INFO_ON_ROOT("Max particle height " << maxParticleHeight); + if ((sinusOffset + sinusAmplitude) > maxParticleHeight) + WALBERLA_ABORT("Created particle bed is below desired sinus shape!"); + if (real_c(2) * sinusAmplitude > (maxParticleHeight - particleFixingHeight)) + WALBERLA_ABORT("Created mobile particle bed is not high enough for desired sinus shape!"); + if (useNoSlipParticles && (particleFixingHeight < maxParticleHeight)) + WALBERLA_ABORT("You are using no-slip BCs on particles (which does not set hydrodynamic forces) but do not " + "fix all particles - this leads to wrong behavior and is not permitted!") + + // fix lower particles + particleStorage->forEachParticle( + useOpenMP, mesa_pd::kernel::SelectAll(), *particleAccessor, + [particleFixingHeight](const size_t idx, auto& ac) { + if (ac.getPosition(idx)[2] < 
particleFixingHeight) + mesa_pd::data::particle_flags::set(ac.getFlagsRef(idx), mesa_pd::data::particle_flags::FIXED); + }, + *particleAccessor); + } + else + { + real_t avgParticleDiameterTest = real_c(0); + particleStorage->forEachParticle( + false, mesa_pd::kernel::SelectLocal(), *particleAccessor, + [&numParticles, &avgParticleDiameterTest](const size_t idx, auto& ac) { + auto sp = static_cast< mesa_pd::data::Sphere* >(ac.getBaseShape(idx).get()); + ++numParticles; + avgParticleDiameterTest += real_c(2) * sp->getRadius(); + }, + *particleAccessor); + mpi::allReduceInplace(numParticles, mpi::SUM); + mpi::allReduceInplace(avgParticleDiameterTest, mpi::SUM); + avgParticleDiameterTest /= real_c(numParticles); + WALBERLA_LOG_INFO_ON_ROOT("Read particles from check pointing file with avg diameter of " + << avgParticleDiameterTest) + if (std::abs(avgParticleDiameterTest - avgParticleDiameter) / avgParticleDiameterTest > real_c(0.05)) + { + WALBERLA_ABORT("Particle diameters not correct.") + } + } + WALBERLA_LOG_INFO_ON_ROOT("Created " << numParticles << " particles"); + + // create planes + createPlane(*particleStorage, simulationDomainAABB.minCorner(), Vector3< real_t >(real_c(0), real_c(0), real_c(1))); + createPlane(*particleStorage, simulationDomainAABB.maxCorner(), Vector3< real_t >(real_c(0), real_c(0), real_c(-1))); + + const real_t blockSyncExtension = real_c(2.5); + real_t maxPossibleParticleDiameter = avgParticleDiameter * real_c(1.1); + if (maxPossibleParticleDiameter < real_c(2) * real_c(cellsPerBlock.min()) - blockSyncExtension) + { + WALBERLA_LOG_INFO_ON_ROOT("Using next neighbor sync for particles"); + syncCall = [particleStorage, mesapdDomain, blockSyncExtension]() { + mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc; + syncNextNeighborFunc(*particleStorage, *mesapdDomain, blockSyncExtension); + }; + syncCall(); + } + else + { + WALBERLA_LOG_INFO_ON_ROOT("Using ghost owner sync for particles") + syncCall = [particleStorage, mesapdDomain, 
blockSyncExtension]() { + mesa_pd::mpi::SyncGhostOwners syncGhostOwnersFunc; + syncGhostOwnersFunc(*particleStorage, *mesapdDomain, blockSyncExtension); + }; + for (uint_t i = uint_c(0); i < uint_c(std::ceil(maxPossibleParticleDiameter / real_c(cellsPerBlock.min()))); ++i) + syncCall(); + } + + if (useNoSlipParticles) + { + particleStorage->forEachParticle(useOpenMP, SphereSelector(), *particleAccessor, particleMappingKernel, + *particleAccessor, AntidunesBoundaryHandling_T::noSlipFlagID); + } + else + { + particleStorage->forEachParticle(useOpenMP, SphereSelector(), *particleAccessor, movingParticleMappingKernel, + *particleAccessor, AntidunesBoundaryHandling_T::movingObstacleFlagID); + } + + // IMPORTANT REMARK: this must be only called after every solid flag has been set; otherwise, the boundary handling + // might not detect solid flags correctly + antidunesBoundaryHandling->initFlagsFromFillLevel(); + + // initialize hydrostatic pressure + if (!loadSnapshot) { initHydrostaticPressure< PdfField_T >(blockForest, pdfFieldID, hydrostaticDensityFct); } + + // initialize force density field + walberla::free_surface::initForceDensityFieldCodegen< PdfField_T, FlagField_T, VectorFieldFlattened_T, + ScalarField_T >( + blockForest, forceDensityFieldID, fillFieldID, pdfFieldID, flagFieldID, flagInfo, *acceleration); + + // communication after initialization + Communication_T communication(blockForest, flagFieldID, fillFieldID, forceDensityFieldID); + communication(); + + PdfCommunication_T pdfCommunication(blockForest, pdfFieldID); + pdfCommunication(); + + // add bubble model + std::shared_ptr< walberla::free_surface::bubble_model::BubbleModelBase > bubbleModel = + std::make_shared< walberla::free_surface::bubble_model::BubbleModelConstantPressure >(real_c(1)); + + // set density in non-liquid or non-interface cells to 1 (after initializing with hydrostatic pressure) + // setDensityInNonFluidCellsToOne< FlagField_T, PdfField_T >(blockForest, flagInfo, flagFieldID, 
pdfFieldID); + + // create timeloop + SweepTimeloop timeloop(blockForest, timesteps); + timeloop.setCurrentTimeStep(beginTimeStep); + + timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger"); + + // Laplace pressure = 2 * surface tension * curvature; curvature computation is not necessary with no surface + // tension + bool computeCurvature = false; + if (!realIsEqual(surfaceTension, real_c(0), real_c(1e-14))) { computeCurvature = true; } + + auto blockStateUpdate = StateSweep(blockForest, flagInfo, flagFieldID); + + // add surface geometry handler + BlockDataID curvatureFieldID = + field::addToStorage< ScalarField_T >(blockForest, "Curvature field", real_c(0), field::fzyx, uint_c(1)); + BlockDataID normalFieldID = field::addToStorage< VectorField_T >( + blockForest, "Normal field", Vector3< real_t >(real_c(0)), field::fzyx, uint_c(1)); + BlockDataID obstacleNormalFieldID = field::addToStorage< VectorField_T >( + blockForest, "Obstacle normal field", Vector3< real_t >(real_c(0)), field::fzyx, uint_c(1)); + // add field for smoothed fill levels + BlockDataID smoothFillFieldID = + field::addToStorage< ScalarField_T >(blockForest, "Smooth fill level field", real_c(0), field::fzyx, uint_c(1)); + + // smooth fill level field for decreasing error in finite difference normal and curvature computation (see + // dissertation of S. Bogner, 2017 (section 4.4.2.1)) + walberla::free_surface::SmoothingSweep< CommunicationStencil_T, FlagField_T, ScalarField_T, VectorField_T > + smoothingSweep(smoothFillFieldID, fillFieldID, flagFieldID, + walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, flagInfo.getObstacleIDSet(), + enableWetting); + // IMPORTANT REMARK: SmoothingSweep must be executed on all blocks, because the algorithm works on all liquid, + // interface and gas cells. This is necessary since the normals are not only computed in interface cells, but also + // in the neighborhood of interface cells. 
Therefore, meaningful values for the fill levels of the second + // neighbors of interface cells are also required in NormalSweep. + timeloop.add() << Sweep(smoothingSweep, "Sweep: fill level smoothing") + << AfterFunction(Communication_T(blockForest, smoothFillFieldID), + "Communication: after smoothing sweep"); + + // compute interface normals (using smoothed fill level field) + walberla::free_surface::NormalSweep< CommunicationStencil_T, FlagField_T, ScalarField_T, VectorField_T > normalSweep( + normalFieldID, smoothFillFieldID, flagFieldID, walberla::free_surface::flagIDs::interfaceFlagID, + walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, flagInfo.getObstacleIDSet(), true, false, true, + false); + timeloop.add() << Sweep(normalSweep, "Sweep: normal computation", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: normal") + << AfterFunction(Communication_T(blockForest, normalFieldID), "Communication: after normal sweep"); + + if (computeCurvature) + { + // compute interface curvature using finite differences according to Brackbill et al. 
+ walberla::free_surface::CurvatureSweepFiniteDifferences< CommunicationStencil_T, FlagField_T, ScalarField_T, + VectorField_T > + curvSweep(curvatureFieldID, normalFieldID, obstacleNormalFieldID, flagFieldID, + walberla::free_surface::flagIDs::interfaceFlagID, + walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, flagInfo.getObstacleIDSet(), false, + real_c(0)); + timeloop.add() << Sweep(curvSweep, "Sweep: curvature computation (finite difference method)", + StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: curvature") + << AfterFunction(Communication_T(blockForest, curvatureFieldID), + "Communication: after curvature sweep"); + } + + // add surface dynamics handler + + // add standard waLBerla boundary handling + timeloop.add() << Sweep(antidunesBoundaryHandling->getBoundarySweep(), "Sweep: boundary handling", + Set< SUID >::emptySet(), StateSweep::onlyGasAndBoundary) + << Sweep(emptySweep(), "Empty sweep: boundary handling", StateSweep::onlyGasAndBoundary); + + // add sweep for weighting force in interface cells with fill level and density + // different version for codegen because pystencils does not support 'Ghostlayerfield<Vector3(), 1>' + const ForceDensityCodegenSweep< LatticeModel_T, FlagField_T, VectorFieldFlattened_T, ScalarField_T > + forceDensityCodegenSweep(forceDensityFieldID, pdfFieldID, flagFieldID, fillFieldID, flagInfo, acceleration); + timeloop.add() << Sweep(forceDensityCodegenSweep, "Sweep: force weighting", Set< SUID >::emptySet(), + StateSweep::onlyGasAndBoundary) + << Sweep(emptySweep(), "Empty sweep: force weighting", StateSweep::onlyGasAndBoundary) + << AfterFunction(Communication_T(blockForest, forceDensityFieldID), + "Communication: after force weighting sweep"); + + // sweep for + // - reconstruction of PDFs in interface cells + // - streaming of PDFs in interface cells (and liquid cells on the same block) + // - advection of mass + // - update bubble volumes + // - marking interface cells for conversion + 
const walberla::free_surface::StreamReconstructAdvectSweep< + LatticeModel_T, typename AntidunesBoundaryHandling_T::BoundaryHandling_T, FlagField_T, + typename AntidunesBoundaryHandling_T::FlagInfo_T, ScalarField_T, VectorField_T, true > + streamReconstructAdvectSweep(surfaceTension, antidunesBoundaryHandling->getHandlingID(), fillFieldID, flagFieldID, + pdfFieldID, normalFieldID, curvatureFieldID, flagInfo, bubbleModel.get(), + pdfReconstructionModel, useSimpleMassExchange, cellConversionThreshold, + cellConversionForceThreshold); + // sweep acts only on blocks with at least one interface cell (due to StateSweep::fullFreeSurface) + timeloop.add() << Sweep(streamReconstructAdvectSweep, "Sweep: StreamReconstructAdvect", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: StreamReconstructAdvect") + // do not communicate PDFs here: + // - stream on blocks with "StateSweep::fullFreeSurface" was performed here using post-collision PDFs + // - stream on other blocks is performed below and should also use post-collision PDFs + // => if PDFs were communicated here, the ghost layer of other blocks would have post-stream PDFs + << AfterFunction(Communication_T(blockForest, fillFieldID, flagFieldID), + "Communication: after StreamReconstructAdvect sweep") + << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID), + "Second ghost layer update: after StreamReconstructAdvect sweep (fill level field)") + << AfterFunction(blockforest::UpdateSecondGhostLayer< FlagField_T >(blockForest, flagFieldID), + "Second ghost layer update: after StreamReconstructAdvect sweep (flag field)"); + + auto lbmSweepGenerated = typename LatticeModel_T::Sweep(pdfFieldID); + + // temporary class for being able to call the LBM collision with operator() + class CollideSweep + { + public: + CollideSweep(const typename LatticeModel_T::Sweep& sweep) : sweep_(sweep){}; + + void operator()(IBlock* const block, const uint_t numberOfGhostLayersToInclude = 
uint_t(0)) + { + sweep_.collide(block, numberOfGhostLayersToInclude); + } + + private: + typename LatticeModel_T::Sweep sweep_; + }; + + timeloop.add() << Sweep(CollideSweep(lbmSweepGenerated), "Sweep: collision (generated)", StateSweep::fullFreeSurface) + << Sweep(lbmSweepGenerated, "Sweep: streamCollide (generated)", StateSweep::onlyLBM) + << Sweep(emptySweep(), "Empty sweep: streamCollide (generated)") + << AfterFunction(PdfCommunication_T(blockForest, pdfFieldID), + "Communication: after streamCollide (generated)"); + + // convert cells + // - according to the flags from StreamReconstructAdvectSweep (interface -> gas/liquid) + // - to ensure a closed layer of interface cells (gas/liquid -> interface) + // - detect and register bubble merges/splits (bubble volumes are already updated in StreamReconstructAdvectSweep) + // - convert cells and initialize PDFs near inflow boundaries + const walberla::free_surface::CellConversionSweep< LatticeModel_T, AntidunesBoundaryHandling_T::BoundaryHandling_T, + ScalarField_T > + cellConvSweep(antidunesBoundaryHandling->getHandlingID(), pdfFieldID, flagInfo, bubbleModel.get()); + timeloop.add() << Sweep(cellConvSweep, "Sweep: cell conversion", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: cell conversion") + //<< AfterFunction(PdfCommunication_T(blockForest, pdfFieldID), + // + // "Communication: after cell conversion sweep (PDF field)") + // communicate the flag field also in corner directions + << AfterFunction(Communication_T(blockForest, flagFieldID), + "Communication: after cell conversion sweep (flag field)") + << AfterFunction(blockforest::UpdateSecondGhostLayer< FlagField_T >(blockForest, flagFieldID), + "Second ghost layer update: after cell conversion sweep (flag field)"); + + // reinitialize PDFs, i.e., refill cells that were converted from gas to interface + // - when the flag "convertedFromGasToInterface" has been set (by CellConversionSweep) + // - according to the method specified with 
pdfRefillingModel_ + const walberla::free_surface::EquilibriumRefillingSweep< LatticeModel_T, FlagField_T > equilibriumRefillingSweep( + pdfFieldID, flagFieldID, flagInfo, true); + timeloop.add() << Sweep(equilibriumRefillingSweep, "Sweep: EquilibriumRefilling", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: EquilibriumRefilling") + << AfterFunction(PdfCommunication_T(blockForest, pdfFieldID), + "Communication: after EquilibriumRefilling sweep"); + + // distribute excess mass: + // - excess mass: mass that is free after conversion from interface to gas/liquid cells + // - update the bubble model + // IMPORTANT REMARK: this sweep computes the mass via the density, i.e., the PDF field must be up-to-date and the + // PdfRefillingSweep must have been performed + if (excessMassModel.isEvenlyType()) + { + const walberla::free_surface::ExcessMassDistributionSweepInterfaceEvenly< LatticeModel_T, FlagField_T, + ScalarField_T, VectorField_T > + distributeMassSweep(excessMassModel, fillFieldID, flagFieldID, pdfFieldID, flagInfo); + timeloop.add() + << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: distribute excess mass") + << AfterFunction(Communication_T(blockForest, fillFieldID), + "Communication: after excess mass distribution sweep") + << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID), + "Second ghost layer update: after excess mass distribution sweep (fill level field)") + // update bubble model, i.e., perform registered bubble merges/splits; bubble merges/splits are + // already detected and registered by CellConversionSweep + << AfterFunction(std::bind(&walberla::free_surface::bubble_model::BubbleModelBase::update, bubbleModel), + "Sweep: bubble model update"); + } + else + { + if (excessMassModel.isWeightedType()) + { + const walberla::free_surface::ExcessMassDistributionSweepInterfaceWeighted< LatticeModel_T, 
FlagField_T, + ScalarField_T, VectorField_T > + distributeMassSweep(excessMassModel, fillFieldID, flagFieldID, pdfFieldID, flagInfo, normalFieldID); + timeloop.add() + << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: distribute excess mass") + << AfterFunction(Communication_T(blockForest, fillFieldID), + "Communication: after excess mass distribution sweep") + << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID), + "Second ghost layer update: after excess mass distribution sweep (fill level field)") + // update bubble model, i.e., perform registered bubble merges/splits; bubble merges/splits + // are already detected and registered by CellConversionSweep + << AfterFunction(std::bind(&walberla::free_surface::bubble_model::BubbleModelBase::update, bubbleModel), + "Sweep: bubble model update"); + } + else + { + if (excessMassModel.isEvenlyAllInterfaceFallbackLiquidType()) + { + const walberla::free_surface::ExcessMassDistributionSweepInterfaceAndLiquid< LatticeModel_T, FlagField_T, + ScalarField_T, VectorField_T > + distributeMassSweep(excessMassModel, fillFieldID, flagFieldID, pdfFieldID, flagInfo, excessMassFieldID); + timeloop.add() + // perform this sweep also on "onlyLBM" blocks because liquid cells also exchange excess mass here + << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::fullFreeSurface) + << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::onlyLBM) + << Sweep(emptySweep(), "Empty sweep: distribute excess mass") + << AfterFunction(Communication_T(blockForest, fillFieldID, excessMassFieldID), + "Communication: after excess mass distribution sweep") + << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID), + "Second ghost layer update: after excess mass distribution sweep (fill level field)") + // update bubble model, i.e., perform 
registered bubble merges/splits; bubble + // merges/splits are already detected and registered by CellConversionSweep + << AfterFunction(std::bind(&walberla::free_surface::bubble_model::BubbleModelBase::update, bubbleModel), + "Sweep: bubble model update"); + } + } + } + + // reset all flags that signal cell conversions (except "keepInterfaceForWettingFlag") + walberla::free_surface::ConversionFlagsResetSweep< FlagField_T > resetConversionFlagsSweep(flagFieldID, flagInfo); + timeloop.add() << Sweep(resetConversionFlagsSweep, "Sweep: conversion flag reset", StateSweep::fullFreeSurface) + << Sweep(emptySweep(), "Empty sweep: conversion flag reset") + << AfterFunction(Communication_T(blockForest, flagFieldID), + "Communication: after excess mass distribution sweep") + << AfterFunction(blockforest::UpdateSecondGhostLayer< FlagField_T >(blockForest, flagFieldID), + "Second ghost layer update: after excess mass distribution sweep (flag field)"); + + // update block states + timeloop.add() << Sweep(blockStateUpdate, "Sweep: block state update"); + + // add VTK output + walberla::free_surface::addVTKOutput< LatticeModel_T, AntidunesBoundaryHandling_T, PdfField_T, FlagField_T, + ScalarField_T, VectorField_T >( + blockForest, timeloop, configPtr, flagInfo, pdfFieldID, flagFieldID, fillFieldID, BlockDataID(), curvatureFieldID, + normalFieldID, obstacleNormalFieldID); + + // add triangle mesh output of free surface + walberla::free_surface::SurfaceMeshWriter< ScalarField_T, FlagField_T > surfaceMeshWriter( + blockForest, fillFieldID, flagFieldID, walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, real_c(0), + configPtr); + surfaceMeshWriter(); // write initial mesh + timeloop.addFuncAfterTimeStep(surfaceMeshWriter, "Writer: surface mesh"); + + if (vtkSpacingParticles != uint_t(0)) + { + // particle field + auto particleFieldVTK = + vtk::createVTKOutput_BlockData(blockForest, "particle_field", vtkSpacingParticles, 0, false, vtkFolder); + auto cellBB_filterParameters = 
vtkFluidParameters.getOneBlock("CellBB_filter"); + const Vector3< uint_t > cellBB_filterMin = cellBB_filterParameters.getParameter< Vector3< uint_t > >("min"); + const Vector3< uint_t > cellBB_filterMax = cellBB_filterParameters.getParameter< Vector3< uint_t > >("max"); + AABB sliceAABB(real_c(cellBB_filterMin[0]), real_c(cellBB_filterMin[1]), real_c(cellBB_filterMin[2]), + real_c(cellBB_filterMax[0] + uint_t(1)), real_c(cellBB_filterMax[1] + uint_t(1)), + real_c(cellBB_filterMax[2] + uint_t(1))); + + particleFieldVTK->addCellInclusionFilter(vtk::AABBCellFilter(sliceAABB)); + particleFieldVTK->addCellDataWriter( + make_shared< field::VTKWriter< GhostLayerField< walberla::id_t, 1 > > >(particleFieldID, "particleField")); + particleFieldVTK->setSamplingResolution(vtkFluidParameters.getParameter< real_t >("samplingResolution")); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleFieldVTK), "VTK (particle field data)"); + } + + if (vtkSpacingParticles != uint_t(0)) + { + // sphere + auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(particleStorage); + particleVtkOutput->addOutput< mesa_pd::data::SelectParticleUid >("uid"); + particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity"); + particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius"); + // limit output to process-local spheres + particleVtkOutput->setParticleSelector([](const mesa_pd::data::ParticleStorage::iterator& pIt) { + using namespace walberla::mesa_pd::data::particle_flags; + return (pIt->getBaseShape()->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE) && + !isSet(pIt->getFlags(), GHOST); + }); + auto particleVtkWriter = + vtk::createVTKOutput_PointData(particleVtkOutput, "particles", vtkSpacingParticles, vtkFolder, + std::string("simulation_step"), false, true, true, true, beginTimeStep); + timeloop.addFuncAfterTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)"); + } + + // add logging for 
computational performance + const lbm::PerformanceLogger< FlagField_T > performanceLogger( + blockForest, flagFieldID, walberla::free_surface::flagIDs::liquidInterfaceFlagIDs, performanceLogFrequency); + timeloop.addFuncAfterTimeStep(performanceLogger, "Evaluator: performance logging"); + + // LBM stability check + timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >( + walberlaEnv.config(), blockForest, pdfFieldID, flagFieldID, + walberla::free_surface::flagIDs::liquidInterfaceFlagIDs)), + "LBM stability check"); + + // add sweep for evaluating the fluid's mean velocity + const std::shared_ptr< Vector3< real_t > > meanVelocity = std::make_shared< Vector3< real_t > >(real_c(0)); + const real_t velocityAveragingFactor = real_c(1) / (liquidHeight * real_c(domainSize[0]) * real_c(domainSize[1])); + MeanVelocityComputer< AntidunesBoundaryHandling_T, PdfField_T, FlagField_T > meanVelocityComputer( + blockForest, antidunesBoundaryHandling, pdfFieldID, meanVelocity, velocityAveragingFactor); + + // PID Controller + shared_ptr< ForcingAdjuster > forcingAdjuster = + make_shared< ForcingAdjuster >(blockForest, targetMeanVelocityMagnitude, (*acceleration)[0], proportionalGain, + derivativeGain, integralGain, maxRamp, minActuatingVariable, maxActuatingVariable); + + if (loadSnapshot) { forcingAdjuster->loadPIDSnapshot(snapshotBaseFolder + "/" + "pidState.file"); } + + WcTimingPool timingPool; + + // this is carried out after the particle integration, it corrects the flag field and restores missing PDF + // information then, the checkpointing file can be written, as otherwise some cells are invalid and can not be + // recovered + SweepTimeloop timeloopAfterParticles(blockForest, timesteps); + timeloopAfterParticles.setCurrentTimeStep(beginTimeStep); + + // sweep for updating the particle mapping into the LBM simulation + bool strictlyConserveMomentum = false; + timeloopAfterParticles.add() << Sweep( + 
lbm_mesapd_coupling::makeMovingParticleMapping< PdfField_T, AntidunesBoundaryHandling_T::BoundaryHandling_T >( + blockForest, pdfFieldID, antidunesBoundaryHandling->getHandlingID(), particleFieldID, particleAccessor, + AntidunesBoundaryHandling_T::movingObstacleFlagID, FormerMO_Flag, + lbm_mesapd_coupling::RegularParticlesSelector(), strictlyConserveMomentum), + "Particle Mapping"); + + // sweep for restoring PDFs in cells previously occupied by particles + bool reconstruction_recomputeTargetDensity = false; + bool reconstruction_useCentralDifferences = true; + auto gradReconstructor = + lbm_mesapd_coupling::makeGradsMomentApproximationReconstructor< AntidunesBoundaryHandling_T::BoundaryHandling_T >( + blockForest, antidunesBoundaryHandling->getHandlingID(), relaxationRate, reconstruction_recomputeTargetDensity, + reconstruction_useCentralDifferences); + + timeloopAfterParticles.add() + << Sweep(makeSharedSweep( + lbm_mesapd_coupling::makePdfReconstructionManager< PdfField_T, + AntidunesBoundaryHandling_T::BoundaryHandling_T >( + blockForest, pdfFieldID, antidunesBoundaryHandling->getHandlingID(), particleFieldID, + particleAccessor, FormerMO_Flag, walberla::free_surface::flagIDs::liquidFlagID, gradReconstructor, + strictlyConserveMomentum)), + "PDF Restore") + << AfterFunction(Communication_T(blockForest, flagFieldID, particleFieldID), + "Communication: after PDF reconstruction sweep") // unsure if necessary but added for consistency + << AfterFunction(pdfCommunication, "PDF Communication"); + + real_t timeStepSizeParticles = real_c(1) / real_c(particleNumSubCycles); + mesa_pd::kernel::VelocityVerletPreForceUpdate vvIntegratorPreForce(timeStepSizeParticles); + mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeParticles); + mesa_pd::kernel::LinearSpringDashpot collisionResponse(1); + collisionResponse.setFrictionCoefficientDynamic(size_t(0), size_t(0), particleFrictionCoefficient); + mesa_pd::mpi::ReduceProperty reduceProperty; + 
mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory; + lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque; + lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque; + real_t particleCollisionTime = particleCollisionTimeNonDim * avgParticleDiameter; + lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel( + viscosity, [](real_t r) { return (real_c(0.001 + real_c(0.00007) * r)) * r; }); + + WALBERLA_LOG_INFO_ON_ROOT("Will use particle time step size of " + << timeStepSizeParticles << " and collision time of " << particleCollisionTime); + + AverageDataSliceEvaluator< PdfField_T, AntidunesBoundaryHandling_T, FlagField_T, ScalarField_T > + averageDataSliceEvaluator(blockForest, flagFieldID, fillFieldID, pdfFieldID); + + std::shared_ptr< real_t > totalFluidMass = std::make_shared< real_t >(real_c(0)); + walberla::free_surface::TotalMassComputer< AntidunesBoundaryHandling_T, PdfField_T, FlagField_T, ScalarField_T > + totalFluidMassEvaluator(blockForest, antidunesBoundaryHandling, pdfFieldID, fillFieldID, evaluationFrequency, + totalFluidMass); + + BedloadTransportEvaluator< ParticleAccessor_T > bedloadTransportEvaluator( + particleAccessor, real_c(1) / real_c(domainSize[0] * domainSize[1]), numParticles); + auto bedLoadTransportFileName = baseFolderName + "/bedload.txt"; + WALBERLA_LOG_INFO_ON_ROOT("Writing bedload info to file " << bedLoadTransportFileName); + + auto fluidInfoFileName = baseFolderName + "/fluidInfo.txt"; + WALBERLA_LOG_INFO_ON_ROOT("Writing fluid info to file " << fluidInfoFileName); + + // write info file + WALBERLA_ROOT_SECTION() + { + std::ofstream evalInfoFile(baseFolderName + "/info.txt"); + evalInfoFile << evaluationFrequency << "\n"; + evalInfoFile << gravity << "\n"; + evalInfoFile << viscosity << "\n"; + evalInfoFile << particleDensityRatio << "\n"; + evalInfoFile << avgParticleDiameter << "\n"; + evalInfoFile << domainSize[0] << "\n"; + 
evalInfoFile << domainSize[1] << "\n"; + evalInfoFile << domainSize[2] << "\n"; + evalInfoFile << numParticles << "\n"; + evalInfoFile << dx_SI << "\n"; + evalInfoFile << dt_SI << "\n"; + evalInfoFile.close(); + } + + Vector3< real_t > totalHydrodynamicForceOnParticles(real_c(0)); // only root will have valid values + + for (uint_t t = beginTimeStep; t != timesteps; ++t) + { + timeloop.singleStep(timingPool, true); + + timingPool["Mesa_pd"].start(); + + reduceProperty.operator()< mesa_pd::HydrodynamicForceTorqueNotification >(*particleStorage); + + if (t == 0) + { + lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel + initializeHydrodynamicForceTorqueForAveragingKernel; + particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor, + initializeHydrodynamicForceTorqueForAveragingKernel, *particleAccessor); + } + particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor, + averageHydrodynamicForceTorque, *particleAccessor); + + for (auto subCycle = uint_t(0); subCycle < particleNumSubCycles; ++subCycle) + { + particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor, + vvIntegratorPreForce, *particleAccessor); + syncCall(); + + if (useLubricationCorrection) + { + // lubrication correction + particleStorage->forEachParticlePairHalf( + useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *particleAccessor, + [&lubricationCorrectionKernel, &mesapdDomain](const size_t idx1, const size_t idx2, auto& ac) { + mesa_pd::collision_detection::AnalyticContactDetection acd; + acd.getContactThreshold() = lubricationCorrectionKernel.getNormalCutOffDistance(); + mesa_pd::kernel::DoubleCast double_cast; + mesa_pd::mpi::ContactFilter contact_filter; + if (double_cast(idx1, idx2, ac, acd, ac)) + { + if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *mesapdDomain)) + { + double_cast(acd.getIdx1(), acd.getIdx2(), ac, 
lubricationCorrectionKernel, ac, + acd.getContactNormal(), acd.getPenetrationDepth()); + } + } + }, + *particleAccessor); + } + + // collision response + particleStorage->forEachParticlePairHalf( + useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *particleAccessor, + [&collisionResponse, &mesapdDomain, timeStepSizeParticles, particleRestitutionCoefficient, + particleCollisionTime, particleKappa](const size_t idx1, const size_t idx2, auto& ac) { + mesa_pd::collision_detection::AnalyticContactDetection acd; + mesa_pd::kernel::DoubleCast double_cast; + mesa_pd::mpi::ContactFilter contact_filter; + if (double_cast(idx1, idx2, ac, acd, ac)) + { + if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *mesapdDomain)) + { + auto meff = real_c(1) / (ac.getInvMass(idx1) + ac.getInvMass(idx2)); + collisionResponse.setStiffnessAndDamping(0, 0, particleRestitutionCoefficient, + particleCollisionTime, particleKappa, meff); + collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), + acd.getPenetrationDepth(), timeStepSizeParticles); + } + } + }, + *particleAccessor); + + reduceAndSwapContactHistory(*particleStorage); + + // add hydrodynamic force + lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction; + particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor, + addHydrodynamicInteraction, *particleAccessor); + + // add external forces + particleStorage->forEachParticle( + useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor, + [particleDensityRatio, acceleration](const size_t idx, auto& ac) { + mesa_pd::addForceAtomic(idx, ac, + ac.getVolume(idx) * + Vector3< real_t >((*acceleration)[0], (*acceleration)[1], + (particleDensityRatio - real_c(1)) * (*acceleration)[2])); + }, + *particleAccessor); + + reduceProperty.operator()< mesa_pd::ForceTorqueNotification >(*particleStorage); + + particleStorage->forEachParticle(useOpenMP, 
mesa_pd::kernel::SelectLocal(), *particleAccessor, + vvIntegratorPostForce, *particleAccessor); + syncCall(); + } + + // has to be evaluated here before the force info is erased from particles + if (t % evaluationFrequency == uint_c(0)) + totalHydrodynamicForceOnParticles = getTotalHydrodynamicForceOnParticles(particleAccessor); + + particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *particleAccessor, + resetHydrodynamicForceTorque, *particleAccessor); + timingPool["Mesa_pd"].end(); + + // update particle mapping + timeloopAfterParticles.singleStep(timingPool, true); + + timingPool["Evaluation"].start(); + + if (t % evaluationFrequency == uint_c(0)) + { + averageDataSliceEvaluator(); + totalFluidMassEvaluator.computeMass(blockForest, antidunesBoundaryHandling); + bedloadTransportEvaluator(); + meanVelocityComputer(); + + WALBERLA_ROOT_SECTION() + { + write2DVectorToFile(averageDataSliceEvaluator.getSolidVolumeFractionVector(), + averageDataSliceEvaluator.getXLen(), averageDataSliceEvaluator.getZLen(), + baseFolderName + "/svfSlice_" + std::to_string(t) + ".txt"); + write2DVectorToFile(averageDataSliceEvaluator.getFillLevelVector(), averageDataSliceEvaluator.getXLen(), + averageDataSliceEvaluator.getZLen(), + baseFolderName + "/fillSlice_" + std::to_string(t) + ".txt"); + write2DVectorToFile(averageDataSliceEvaluator.getVelocityXVector(), averageDataSliceEvaluator.getXLen(), + averageDataSliceEvaluator.getZLen(), + baseFolderName + "/velXSlice_" + std::to_string(t) + ".txt"); + + std::ofstream bedloadFile(bedLoadTransportFileName, std::ofstream::app); + bedloadFile << t << " " << bedloadTransportEvaluator.getTransportRate() << " " + << bedloadTransportEvaluator.getAverageVelocity() << " " << totalHydrodynamicForceOnParticles[0] + << " " << totalHydrodynamicForceOnParticles[1] << " " << totalHydrodynamicForceOnParticles[2] + << "\n"; + bedloadFile.close(); + + 
WALBERLA_LOG_DEVEL("____________________________________________________________________"); + WALBERLA_LOG_DEVEL("time step = " << t); + const real_t froudeNumber = + (*meanVelocity)[0] / real_c(std::sqrt(liquidHeight * std::abs((*acceleration)[2]))); + + const real_t reynoldsNumber = (*meanVelocity)[0] * liquidHeight / viscosity; + + const real_t weberNumber = + real_c(1.0) * (*meanVelocity)[0] * (*meanVelocity)[0] * liquidHeight / surfaceTension; + + WALBERLA_LOG_DEVEL(" - Total fluid mass = " << std::setprecision(16) << (*totalFluidMass)); + auto maxFluidZPos = averageDataSliceEvaluator.getMaxFluidZPos(); + WALBERLA_LOG_DEVEL(" - Max fluid z-position = " << maxFluidZPos); + WALBERLA_LOG_DEVEL(" - Froude number = " << froudeNumber); + WALBERLA_LOG_DEVEL(" - Reynolds number = " << reynoldsNumber); + WALBERLA_LOG_DEVEL(" - We = " << weberNumber); + + WALBERLA_LOG_DEVEL(" - meanVelocity = " << *meanVelocity); + + std::ofstream fluidInfoFile(fluidInfoFileName, std::ofstream::app); + fluidInfoFile << t << " " << (*acceleration)[0] << " " << (*meanVelocity)[0] << " " << maxFluidZPos << " " + << std::setprecision(16) << (*totalFluidMass) << "\n"; + fluidInfoFile.close(); + + if (std::isnan(reynoldsNumber)) WALBERLA_ABORT("reynoldsNumber is NaN!") + } + + WALBERLA_LOG_DEVEL_ON_ROOT( + " -> CurrentExternalAcceleration in x-direction before update = " << (*acceleration)[0]); + (*forcingAdjuster)(meanVelocity->length()); + (*acceleration)[0] = forcingAdjuster->getExternalForcing(); + WALBERLA_LOG_DEVEL_ON_ROOT( + " -> CurrentExternalAcceleration in x-direction after update = " << (*acceleration)[0]); + } + timingPool["Evaluation"].end(); + + if (storeSnapshot) + { + if (t % snapshotFrequency == uint_c(0) && t > uint_c(0)) + { + WALBERLA_LOG_INFO_ON_ROOT("Writing checkpointing file in time step " << t) + + blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" + pdfFieldFile, pdfFieldID); + blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" + fillFieldFile, 
fillFieldID); + blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" + excessMassFieldFile, excessMassFieldID); + blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" + particleStorageFile, particleStorageID); + + WALBERLA_ROOT_SECTION() + { + std::string tmpCheckpointConfigFile = snapshotBaseFolder + "/tmp_" + checkpointConfigFile; + std::ofstream file; + file.open(tmpCheckpointConfigFile.c_str()); + + file << std::setprecision(16); + file << t + 1 << "\n"; + file << (*acceleration)[0] << "\n"; + file.close(); + } + + forcingAdjuster->storePIDSnapshot(snapshotBaseFolder + "/" + "pidState.file"); + + WALBERLA_MPI_BARRIER(); + + // rename checkpoint files to "real" ones + // otherwise, the checkpointed state might be incomplete if the simulation stops due to over time during + // checkpointing + WALBERLA_ROOT_SECTION() + { + renameFile(snapshotBaseFolder + "/tmp_" + pdfFieldFile, snapshotBaseFolder + "/" + pdfFieldFile); + renameFile(snapshotBaseFolder + "/tmp_" + fillFieldFile, snapshotBaseFolder + "/" + fillFieldFile); + renameFile(snapshotBaseFolder + "/tmp_" + excessMassFieldFile, + snapshotBaseFolder + "/" + excessMassFieldFile); + renameFile(snapshotBaseFolder + "/tmp_" + particleStorageFile, + snapshotBaseFolder + "/" + particleStorageFile); + renameFile(snapshotBaseFolder + "/tmp_" + checkpointConfigFile, + snapshotBaseFolder + "/" + checkpointConfigFile); + } + } + } + + if (t % performanceLogFrequency == uint_c(0) && t > uint_c(0)) { timingPool.logResultOnRoot(); } + } + + return EXIT_SUCCESS; +} + +} // namespace antidunes +} // namespace walberla + +int main(int argc, char** argv) { return walberla::antidunes::main(argc, argv); } diff --git a/apps/showcases/Antidunes/Antidunes.prm b/apps/showcases/Antidunes/Antidunes.prm new file mode 100644 index 0000000000000000000000000000000000000000..d95db01d09400e7538e58ce99c93337bde5a0a21 --- /dev/null +++ b/apps/showcases/Antidunes/Antidunes.prm @@ -0,0 +1,146 @@ +BlockForestParameters +{ + cellsPerBlock < 
50, 20, 40 >; + periodicity < 1, 1, 0 >; + loadSnapshot false; + storeSnapshot true; + snapshotFrequency 10000; + snapshotBaseFolder snapshots; +} + +DomainParameters +{ + domainSize <3200, 60, 160>; + wavePeriods 1; // never set to 0 -> division by zero, even if you initialize a flat particle bed + liquidHeightFactor 2.862; // h_0 / d (water height / avg particle diameter) -> from experiment [E1=2.862, E2=3.1724, E3=3.27586, E4=3.5862] + floorHeightFactor 4.1393; // from domain bottom to sine's average + initialAmplitude 0; // defined from minimum peak to maximum peak as by Pascal et al. (2021) +} + +PIDParameters +{ + targetMeanVelocityMagnitude 0.02; + proportionalGain 2e-4; + derivativeGain 1e-6; + integralGain 2e-4; + maxRamp 1e-4; + minActuatingVariable 0; + maxActuatingVariable 1e-3; +} + +PhysicsParameters +{ + enableWetting false; + timesteps 2000000; + Re 3100; // [E1=3100, E2=3772, E3=4180, E4=4800] + Fr 1.31; // [E1=1.31, E2=1.38, E3=1.44, E4=1.45] + We 15.6188; // [E1=15.6188, E2=21.48, E3=25.54, E4=30.2493] +} + +ParticleParameters +{ + inFileName spheres_out.dat; + bedCopiesInX 1; + bedCopiesInY 1; + densityRatio 2.55; + fixingHeightFactor 1.5; // proportional to the mean particle diameter + frictionCoefficient 0.5; + restitutionCoefficient 0.97; + numSubCycles 10; + useLubricationCorrection true; + useNoSlipParticles false; +} + +ModelParameters +{ + pdfReconstructionModel OnlyMissing; + pdfRefillingModel EquilibriumRefilling; + excessMassDistributionModel EvenlyNewInterfaceFallbackLiquid; + curvatureModel FiniteDifferenceMethod; + useSimpleMassExchange false; + cellConversionThreshold 1e-2; + cellConversionForceThreshold 1e-1; +} + +EvaluationParameters +{ + performanceLogFrequency 10000; + evaluationFrequency 1000; + baseFolderName eval; +} + +StabilityChecker +{ + checkFrequency 0; + streamOutput false; + vtkOutput true; +} + +BoundaryParameters +{ + // X + //Border { direction W; walldistance -1; NoSlip{} } + //Border { direction E; walldistance 
-1; NoSlip{} } + + // Y + //Border { direction N; walldistance -1; NoSlip{} } + //Border { direction S; walldistance -1; NoSlip{} } + + // Z + Border { direction T; walldistance -1; NoSlip{} } + Border { direction B; walldistance -1; NoSlip{} } +} + +MeshOutputParameters +{ + writeFrequency 0; + baseFolder mesh-out; +} + +VTK +{ + fluid_field + { + writeFrequency 1000; + ghostLayers 0; + baseFolder vtk-out; + samplingResolution 1; + + writers + { + fill_level; + mapped_flag; + velocity; + density; + //curvature; + //normal; + //obstacle_normal; + //pdf; + //flag; + //force; + } + + CellBB_filter { + min < 0, 29, 0 >; + max < 3200, 30, 160 >; + } + + inclusion_filters + { + CellBB_filter; + //liquidInterfaceFilter; // only include liquid and interface cells in VTK output + } + + before_functions + { + //ghost_layer_synchronization; // only needed if writing the ghost layer + gas_cell_zero_setter; // sets velocity=0 and density=1 all gas cells before writing VTK output + } + } + domain_decomposition + { + writeFrequency 10000000000; + baseFolder vtk-out; + outputDomainDecomposition true; + } +} diff --git a/apps/showcases/Antidunes/AntidunesBoundaryHandling.h b/apps/showcases/Antidunes/AntidunesBoundaryHandling.h new file mode 100644 index 0000000000000000000000000000000000000000..f210d8e74f3a638ddfa691f3c1cac05bba14acd8 --- /dev/null +++ b/apps/showcases/Antidunes/AntidunesBoundaryHandling.h @@ -0,0 +1,207 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AntidunesBoundaryHandling.h +//! \ingroup free_surface +//! \author Martin Bauer +//! \author Christoph Schwarzmeier <christoph.schwarzmeier@fau.de> +//! \brief Boundary handling for the free surface LBM module. +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "boundary/BoundaryHandling.h" + +#include "field/GhostLayerField.h" + +#include "geometry/initializer/InitializationManager.h" +#include "geometry/initializer/OverlapFieldFromBody.h" + +#include "lbm/boundary/all.h" +#include "lbm/field/PdfField.h" +#include "lbm/free_surface/FlagInfo.h" +#include "lbm/free_surface/InitFunctions.h" +#include "lbm/free_surface/InterfaceFromFillLevel.h" +#include "lbm/free_surface/boundary/SimplePressureWithFreeSurface.h" + +#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h" +#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h" + +namespace walberla +{ +namespace antidunes +{ +namespace free_surface +{ +/*********************************************************************************************************************** + * Boundary handling for the free surface LBM extension.
+ **********************************************************************************************************************/ +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +class AntidunesBoundaryHandling +{ + public: + using flag_t = typename FlagField_T::value_type; + using Stencil_T = typename LatticeModel_T::Stencil; + using CommunicationStencil_T = + typename std::conditional< LatticeModel_T::Stencil::D == uint_t(2), stencil::D2Q9, stencil::D3Q27 >::type; + using PdfField_T = lbm::PdfField< LatticeModel_T >; + + // boundary + using NoSlip_T = lbm::NoSlip< LatticeModel_T, flag_t >; + using FreeSlip_T = lbm::FreeSlip< LatticeModel_T, FlagField_T >; + using UBB_T = lbm::UBB< LatticeModel_T, flag_t >; + using Pressure_T = walberla::free_surface::SimplePressureWithFreeSurface< LatticeModel_T, FlagField_T >; + using Outlet_T = lbm::Outlet< LatticeModel_T, FlagField_T, 4, 3 >; + using UBB_Inflow_T = + lbm::UBB< LatticeModel_T, flag_t >; // creates interface cells in the direction of the prescribed velocity, i.e., + // represents an inflow boundary condition + using MovingObstacle_T = lbm_mesapd_coupling::SimpleBB< LatticeModel_T, FlagField_T, ParticleAccessor_T >; + + // handling type + using BoundaryHandling_T = + BoundaryHandling< FlagField_T, Stencil_T, NoSlip_T, UBB_T, UBB_Inflow_T, Pressure_T, Pressure_T, Outlet_T, + FreeSlip_T, + MovingObstacle_T >; // 2 pressure boundaries with different densities, e.g., inflow-outflow + + using FlagInfo_T = walberla::free_surface::FlagInfo< FlagField_T >; + + // constructor + AntidunesBoundaryHandling(const std::shared_ptr< StructuredBlockForest >& blockForest, BlockDataID pdfFieldID, + BlockDataID fillLevelID, BlockDataID particleFieldID, + const shared_ptr< ParticleAccessor_T >& ac, + std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct); + + // initialize fluid field from config file using (quotes indicate the string to be used in the file): + 
// - "CellInterval" (see src/geometry/initializer/BoundaryFromCellInterval.h) + // - "Border" (see src/geometry/initializer/BoundaryFromDomainBorder.h) + // - "Image" (see src/geometry/initializer/BoundaryFromImage.h) + // - "Body" (see src/geometry/initializer/OverlapFieldFromBody.h) + inline void initFromConfig(const Config::BlockHandle& block); + + // initialize free surface object from geometric body (see src/geometry/initializer/OverlapFieldFromBody.h) + template< typename Body_T > + inline void addFreeSurfaceObject(const Body_T& body, bool addOrSubtract = false); + + // clear and initialize flags in every cell according to the fill level and initialize fill level in boundaries (with + // value 1) and obstacles such that the bubble model does not detect obstacles as gas cells + void initFlagsFromFillLevel(); + + inline void setNoSlipAtBorder(stencil::Direction d, cell_idx_t wallDistance = cell_idx_c(0)); + inline void setNoSlipAtAllBorders(cell_idx_t wallDistance = cell_idx_c(0)); + void setNoSlipInCell(const Cell& globalCell); + + inline void setFreeSlipAtBorder(stencil::Direction d, cell_idx_t wallDistance = cell_idx_c(0)); + inline void setFreeSlipAtAllBorders(cell_idx_t wallDistance = cell_idx_c(0)); + void setFreeSlipInCell(const Cell& globalCell); + + void setUBBInCell(const Cell& globalCell, const Vector3< real_t >& velocity); + + // UBB that generates interface cells to resemble an inflow boundary + void setInflowInCell(const Cell& globalCell, const Vector3< real_t >& velocity); + + inline void setPressure(real_t density); + void setPressureOutflow(real_t density); + void setBodyForce(const Vector3< real_t >& bodyForce); + + void enableBubbleOutflow(BubbleModelBase* bubbleModel); + + // checks if an obstacle cell is located in an outermost ghost layer (corners are explicitly ignored, as they do not + // influence periodic communication) + Vector3< bool > isObstacleInGlobalGhostLayer(); + + // flag management + const walberla::free_surface::FlagInfo< 
FlagField_T >& getFlagInfo() const { return flagInfo_; } + + // flag IDs + static const field::FlagUID noSlipFlagID; + static const field::FlagUID ubbFlagID; + static const field::FlagUID ubbInflowFlagID; + static const field::FlagUID pressureFlagID; + static const field::FlagUID pressureOutflowFlagID; + static const field::FlagUID outletFlagID; + static const field::FlagUID freeSlipFlagID; + static const field::FlagUID movingObstacleFlagID; + + // boundary IDs + static const BoundaryUID noSlipBoundaryID; + static const BoundaryUID ubbBoundaryID; + static const BoundaryUID ubbInflowBoundaryID; + static const BoundaryUID pressureBoundaryID; + static const BoundaryUID pressureOutflowBoundaryID; + static const BoundaryUID outletBoundaryID; + static const BoundaryUID freeSlipBoundaryID; + static const BoundaryUID movingObstacleBoundaryID; + + inline BlockDataID getHandlingID() const { return handlingID_; } + inline BlockDataID getPdfFieldID() const { return pdfFieldID_; } + inline BlockDataID getFillFieldID() const { return fillFieldID_; } + inline BlockDataID getFlagFieldID() const { return flagFieldID_; } + inline BlockDataID getParticleFieldID() const { return particleFieldID_; } + + inline std::shared_ptr< StructuredBlockForest > getBlockForest() const { return blockForest_; }; + inline shared_ptr< ParticleAccessor_T > getParticleAccessor() const { return ac_; } + inline std::function< real_t(const Vector3< real_t >&) > getHydrostaticDensityFct() const + { + return hydrostaticDensityFct_; + } + + // executes standard waLBerla boundary handling + class ExecuteBoundaryHandling + { + public: + ExecuteBoundaryHandling(const BlockDataID& collection) : handlingID_(collection) {} + void operator()(IBlock* const block) const + { + BoundaryHandling_T* const handling = block->getData< BoundaryHandling_T >(handlingID_); + // reset "near boundary" flags + handling->refresh(); + (*handling)(); + } + + protected: + BlockDataID handlingID_; + }; // class ExecuteBoundaryHandling + 
+ ExecuteBoundaryHandling getBoundarySweep() const { return ExecuteBoundaryHandling(getHandlingID()); } + + private: + walberla::free_surface::FlagInfo< FlagField_T > flagInfo_; + + // register standard waLBerla initializers + geometry::initializer::InitializationManager getInitManager(); + + std::shared_ptr< StructuredBlockForest > blockForest_; + + BlockDataID flagFieldID_; + BlockDataID pdfFieldID_; + BlockDataID fillFieldID_; + BlockDataID particleFieldID_; + shared_ptr< ParticleAccessor_T > ac_; + std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct_; + + BlockDataID handlingID_; + + blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > comm_; +}; // class AntidunesBoundaryHandling + +} // namespace free_surface +} // namespace antidunes +} // namespace walberla + +#include "AntidunesBoundaryHandling.impl.h" diff --git a/apps/showcases/Antidunes/AntidunesBoundaryHandling.impl.h b/apps/showcases/Antidunes/AntidunesBoundaryHandling.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..3aa86f16a5e3d93bcf030e4235b898242767ea90 --- /dev/null +++ b/apps/showcases/Antidunes/AntidunesBoundaryHandling.impl.h @@ -0,0 +1,583 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \file AntidunesBoundaryHandling.impl.h +//! \ingroup free_surface +//! \author Martin Bauer +//! \author Christoph Schwarzmeier <christoph.schwarzmeier@fau.de> +//! \brief Boundary handling for the free surface LBM module. +// +//====================================================================================================================== + +#pragma once + +#include "field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/communication/PackInfo.h" + +#include "geometry/initializer/BoundaryFromCellInterval.h" +#include "geometry/initializer/BoundaryFromDomainBorder.h" +#include "geometry/initializer/BoundaryFromImage.h" +#include "geometry/structured/GrayScaleImage.h" + +#include "lbm/free_surface/FlagInfo.h" +#include "lbm/free_surface/InterfaceFromFillLevel.h" +#include "lbm/lattice_model/CollisionModel.h" + +#include "AntidunesBoundaryHandling.h" + +namespace walberla +{ +namespace antidunes +{ +namespace free_surface +{ +namespace internal +{ +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +class BoundaryBlockDataHandling + : public domain_decomposition::BlockDataHandling< typename AntidunesBoundaryHandling< + LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::BoundaryHandling_T > +{ + public: + using BoundaryHandling_T = + typename AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, + ParticleAccessor_T >::BoundaryHandling_T; // handling as defined in + // AntidunesBoundaryHandling.h + + BoundaryBlockDataHandling( + const AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >* boundary) + : boundary_(boundary) + {} + + // initialize standard waLBerla boundary handling + BoundaryHandling_T* initialize(IBlock* const block) + { + using B = AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >; + using flag_t = typename 
B::flag_t; + + // get fields + FlagField_T* const flagField = block->getData< FlagField_T >(boundary_->getFlagFieldID()); + typename B::PdfField_T* const pdfField = block->getData< typename B::PdfField_T >(boundary_->getPdfFieldID()); + lbm_mesapd_coupling::ParticleField_T* const particleField = + block->getData< lbm_mesapd_coupling::ParticleField_T >(boundary_->getParticleFieldID()); + + auto interfaceFlag = flag_t(flagField->getFlag(walberla::free_surface::flagIDs::interfaceFlagID)); + auto liquidFlag = flag_t(flagField->getFlag(walberla::free_surface::flagIDs::liquidFlagID)); + + // domainMask is used to identify liquid and interface cells + auto domainMask = flag_t(liquidFlag | interfaceFlag); + WALBERLA_ASSERT(domainMask != 0); + + // initialize boundary conditions + typename B::UBB_T ubb(B::ubbBoundaryID, B::ubbFlagID, pdfField, flagField); + typename B::UBB_Inflow_T ubbInflow(B::ubbInflowBoundaryID, B::ubbInflowFlagID, pdfField, flagField); + typename B::NoSlip_T noslip(B::noSlipBoundaryID, B::noSlipFlagID, pdfField); + typename B::Pressure_T pressure(B::pressureBoundaryID, B::pressureFlagID, block, pdfField, flagField, + interfaceFlag, real_c(1.0)); + typename B::Pressure_T pressureOutflow(B::pressureOutflowBoundaryID, B::pressureOutflowFlagID, block, pdfField, + flagField, interfaceFlag, real_c(1.0)); + typename B::Outlet_T outlet(B::outletBoundaryID, B::outletFlagID, pdfField, flagField, domainMask); + typename B::FreeSlip_T freeSlip(B::freeSlipBoundaryID, B::freeSlipFlagID, pdfField, flagField, domainMask); + typename B::MovingObstacle_T curvedLinear(B::movingObstacleBoundaryID, B::movingObstacleFlagID, pdfField, + flagField, particleField, boundary_->getParticleAccessor(), domainMask, + *boundary_->getBlockForest(), *block, + boundary_->getHydrostaticDensityFct()); + + return new BoundaryHandling_T("Boundary Handling", flagField, domainMask, noslip, ubb, ubbInflow, pressure, + pressureOutflow, outlet, freeSlip, curvedLinear); + } + + void 
serialize(IBlock* const block, const BlockDataID& id, mpi::SendBuffer& buffer) + { + BoundaryHandling_T* const boundaryHandlingPtr = block->getData< BoundaryHandling_T >(id); + CellInterval everyCell = boundaryHandlingPtr->getFlagField()->xyzSizeWithGhostLayer(); + boundaryHandlingPtr->pack(buffer, everyCell, true); + } + + BoundaryHandling_T* deserialize(IBlock* const block) { return initialize(block); } + + void deserialize(IBlock* const block, const BlockDataID& id, mpi::RecvBuffer& buffer) + { + BoundaryHandling_T* const boundaryHandlingPtr = block->getData< BoundaryHandling_T >(id); + CellInterval everyCell = boundaryHandlingPtr->getFlagField()->xyzSizeWithGhostLayer(); + boundaryHandlingPtr->unpack(buffer, everyCell, true); + } + + private: + const AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >* boundary_; +}; // class BoundaryBlockDataHandling + +// helper function wrapper for adding the flag field to the block storage; since the input parameter for an +// initialization function in field::addFlagFieldToStorage() is a std::function<void(FlagField_T*,IBlock* const)>, we +// need a function wrapper that has both these input parameters; as FlagInfo< FlagField_T >::registerFlags() does not +// have both of these input parameters, a function wrapper with both input parameters is created and the second input +// parameter is simply ignored inside the function wrapper +template< typename FlagField_T > +void flagFieldInitFunction(FlagField_T* flagField, IBlock* const, const Set< field::FlagUID >& obstacleIDs, + const Set< field::FlagUID >& outflowIDs, const Set< field::FlagUID >& inflowIDs, + const Set< field::FlagUID >& freeSlipIDs) +{ + // register flags in the flag field + walberla::free_surface::FlagInfo< FlagField_T >::registerFlags(flagField, obstacleIDs, outflowIDs, inflowIDs, + freeSlipIDs); +} + +} // namespace internal + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename 
ParticleAccessor_T > +AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::AntidunesBoundaryHandling( + const std::shared_ptr< StructuredBlockForest >& blockForest, BlockDataID pdfFieldID, BlockDataID fillLevelID, + BlockDataID particleFieldID, const shared_ptr< ParticleAccessor_T >& ac, + std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct) + : blockForest_(blockForest), pdfFieldID_(pdfFieldID), fillFieldID_(fillLevelID), particleFieldID_(particleFieldID), + ac_(ac), hydrostaticDensityFct_(std::move(hydrostaticDensityFct)), comm_(blockForest) +{ + // initialize obstacleIDs + Set< FlagUID > obstacleIDs; + obstacleIDs += noSlipFlagID; + obstacleIDs += ubbFlagID; + obstacleIDs += ubbInflowFlagID; + obstacleIDs += pressureFlagID; + obstacleIDs += pressureOutflowFlagID; + obstacleIDs += freeSlipFlagID; + obstacleIDs += outletFlagID; + obstacleIDs += movingObstacleFlagID; + + // initialize outflowIDs + Set< FlagUID > outflowIDs; + outflowIDs += pressureOutflowFlagID; + outflowIDs += outletFlagID; + + // initialize inflowIDs + Set< FlagUID > inflowIDs; + inflowIDs += ubbInflowFlagID; + + // initialize freeSlipIDs + Set< FlagUID > freeSlipIDs; + freeSlipIDs += freeSlipFlagID; + + // create callable function wrapper with input arguments 1 and 2 unset, whereas arguments 3, 4 and 5 are set to be + // obstacleIDs, outflowIDs, and inflowIDs, respectively; this is necessary for field::addFlagFieldToStorage() + auto ffInitFunc = std::bind(internal::flagFieldInitFunction< FlagField_T >, std::placeholders::_1, + std::placeholders::_2, obstacleIDs, outflowIDs, inflowIDs, freeSlipIDs); + + // IMPORTANT REMARK: The flag field needs two ghost layers because of function advectMass(). There, the flags of all + // D3Q* neighbors are determined for each cell, including cells in the first ghost layer. Therefore, all D3Q* + // neighbors of the first ghost layer must be accessible. This requires a second ghost layer.
+ flagFieldID_ = field::addFlagFieldToStorage< FlagField_T >(blockForest, "Flags", uint_c(2), true, ffInitFunc); + + // create FlagInfo + flagInfo_ = walberla::free_surface::FlagInfo< FlagField_T >(obstacleIDs, outflowIDs, inflowIDs, freeSlipIDs); + WALBERLA_ASSERT(flagInfo_.isConsistentAcrossBlocksAndProcesses(blockForest, flagFieldID_)); + + // add boundary handling to blockForest + handlingID_ = blockForest_->addBlockData( + std::make_shared< + internal::BoundaryBlockDataHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T > >(this), + "Boundary Handling"); + + // create communication object with fill level field, since fill levels determine the flags during the simulation + comm_.addPackInfo(std::make_shared< field::communication::PackInfo< ScalarField_T > >(fillFieldID_)); +} + +// define IDs (static const variables) +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::noSlipFlagID = + field::FlagUID("NoSlip"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbFlagID = + field::FlagUID("UBB"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbInflowFlagID = + field::FlagUID("UBB_Inflow"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::pressureFlagID = + field::FlagUID("Pressure"); + +template< typename LatticeModel_T, typename 
FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::pressureOutflowFlagID = + field::FlagUID("PressureOutflow"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::outletFlagID = + field::FlagUID("Outlet"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::freeSlipFlagID = + field::FlagUID("FreeSlip"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const field::FlagUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::movingObstacleFlagID = + field::FlagUID("MovingObstacle"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::noSlipBoundaryID = + BoundaryUID("NoSlip"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbBoundaryID = + BoundaryUID("UBB"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbInflowBoundaryID = + BoundaryUID("UBB_Inflow"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename 
ParticleAccessor_T > +const BoundaryUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::pressureBoundaryID = + BoundaryUID("Pressure"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, + ParticleAccessor_T >::pressureOutflowBoundaryID = + BoundaryUID("PressureOutflow"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::outletBoundaryID = + BoundaryUID("Outlet"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::freeSlipBoundaryID = + BoundaryUID("FreeSlip"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +const BoundaryUID AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, + ParticleAccessor_T >::movingObstacleBoundaryID = + BoundaryUID("MovingObstacle"); + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +geometry::initializer::InitializationManager + AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::getInitManager() +{ + using namespace geometry::initializer; + + InitializationManager initManager(blockForest_->getBlockStorage()); + + // define initializers + auto cellIntvInit = std::make_shared< BoundaryFromCellInterval< BoundaryHandling_T > >(*blockForest_, handlingID_); + auto borderInit = std::make_shared< BoundaryFromDomainBorder< BoundaryHandling_T > >(*blockForest_, handlingID_); + auto imgInit = + std::make_shared< 
BoundaryFromImage< BoundaryHandling_T, geometry::GrayScaleImage > >(*blockForest_, handlingID_); + auto bodyInit = std::make_shared< OverlapFieldFromBody >(*blockForest_, fillFieldID_); + + // register initializers + initManager.registerInitializer("CellInterval", cellIntvInit); + initManager.registerInitializer("Border", borderInit); + initManager.registerInitializer("Image", imgInit); + initManager.registerInitializer("Body", bodyInit); + + return initManager; +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::initFromConfig( + const Config::BlockHandle& configBlock) +{ + // initialize from config file + getInitManager().init(configBlock); +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +template< typename Body_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::addFreeSurfaceObject( + const Body_T& body, bool addOrSubtract) +{ + geometry::initializer::OverlapFieldFromBody(*blockForest_, fillFieldID_).init(body, addOrSubtract); +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setNoSlipAtBorder( + stencil::Direction d, cell_idx_t wallDistance) +{ + geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_); + init.init(noSlipFlagID, d, wallDistance); +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setNoSlipAtAllBorders( + cell_idx_t wallDistance) +{ + geometry::initializer::BoundaryFromDomainBorder< 
BoundaryHandling_T > init(*blockForest_, handlingID_); + init.initAllBorders(noSlipFlagID, wallDistance); +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setNoSlipInCell( + const Cell& globalCell) +{ + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_); + + // transform cell in global coordinates to cell in block local coordinates + Cell blockLocalCell; + blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell); + + handling->forceBoundary(noSlipFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2]); + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setFreeSlipAtBorder( + stencil::Direction d, cell_idx_t wallDistance) +{ + geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_); + init.init(freeSlipFlagID, d, wallDistance); +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, + ParticleAccessor_T >::setFreeSlipAtAllBorders(cell_idx_t wallDistance) +{ + geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_); + init.initAllBorders(freeSlipFlagID, wallDistance); +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setFreeSlipInCell( + const Cell& globalCell) +{ + for (auto 
blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_); + + // transform cell in global coordinates to cell in block local coordinates + Cell blockLocalCell; + blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell); + + handling->forceBoundary(freeSlipFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2]); + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setPressure( + real_t density) +{ + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_); + Pressure_T& pressure = + handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureFlagID)); + pressure.setLatticeDensity(density); + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setUBBInCell( + const Cell& globalCell, const Vector3< real_t >& velocity) +{ + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_); + + typename UBB_Inflow_T::Velocity ubbVel(velocity); + + // transform cell in global coordinates to cell in block-local coordinates + Cell blockLocalCell; + blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell); + + // get block cell bounding box to check if cell is contained in block + CellInterval blockCellBB = blockForest_->getBlockCellBB(*blockIt); + + // flag field has two ghost layers 
so blockCellBB is actually larger than returned; this is relevant for setups + // where the UBB is set in a ghost layer cell + blockCellBB.expand(cell_idx_c(2)); + + if (blockCellBB.contains(globalCell)) + { + handling->forceBoundary(ubbFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2], ubbVel); + } + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setInflowInCell( + const Cell& globalCell, const Vector3< real_t >& velocity) +{ + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_); + + typename UBB_Inflow_T::Velocity ubbVel(velocity); + + // transform cell in global coordinates to cell in block-local coordinates + Cell blockLocalCell; + blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell); + + // get block cell bounding box to check if cell is contained in block + CellInterval blockCellBB = blockForest_->getBlockCellBB(*blockIt); + + // flag field has two ghost layers so blockCellBB is actually larger than returned; this is relevant for setups + // where the UBB is set in a ghost layer cell + blockCellBB.expand(cell_idx_c(2)); + + if (blockCellBB.contains(globalCell)) + { + handling->forceBoundary(ubbInflowFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2], ubbVel); + } + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setPressureOutflow( + real_t density) +{ + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T 
>(handlingID_); + Pressure_T& pressure = + handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureOutflowFlagID)); + pressure.setLatticeDensity(density); + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::enableBubbleOutflow( + BubbleModelBase* bubbleModel) +{ + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_); + + // get pressure from boundary handling + Pressure_T& pressure = + handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureFlagID)); + Pressure_T& pressureOutflow = + handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureOutflowFlagID)); + + // set pressure in bubble model + pressure.setBubbleModel(bubbleModel); + pressureOutflow.setBubbleModel(bubbleModel); + } +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +Vector3< bool > AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, + ParticleAccessor_T >::isObstacleInGlobalGhostLayer() +{ + Vector3< bool > isObstacleInGlobalGhostLayer(false, false, false); + + for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt) + { + const FlagField_T* const flagField = blockIt->template getData< const FlagField_T >(flagFieldID_); + + const CellInterval domainCellBB = blockForest_->getDomainCellBB(); + + // disable OpenMP such that loop termination works correctly + WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ_OMP(flagField, uint_c(1), omp critical, { + // get cell in global coordinates + Cell globalCell = Cell(x, y, z); + blockForest_->transformBlockLocalToGlobalCell(globalCell, *blockIt); + + // check 
if the current cell is located in a global ghost layer + const bool isCellInGlobalGhostLayerX = + globalCell[0] < domainCellBB.xMin() || globalCell[0] > domainCellBB.xMax(); + + const bool isCellInGlobalGhostLayerY = + globalCell[1] < domainCellBB.yMin() || globalCell[1] > domainCellBB.yMax(); + + const bool isCellInGlobalGhostLayerZ = + globalCell[2] < domainCellBB.zMin() || globalCell[2] > domainCellBB.zMax(); + + // skip corners, as they do not influence periodic communication + if ((isCellInGlobalGhostLayerX && (isCellInGlobalGhostLayerY || isCellInGlobalGhostLayerZ)) || + (isCellInGlobalGhostLayerY && isCellInGlobalGhostLayerZ)) + { + continue; + } + + if (!isObstacleInGlobalGhostLayer[0] && isCellInGlobalGhostLayerX && + isPartOfMaskSet(flagField->get(x, y, z), flagField->getMask(flagInfo_.getObstacleIDSet()))) + { + isObstacleInGlobalGhostLayer[0] = true; + } + + if (!isObstacleInGlobalGhostLayer[1] && isCellInGlobalGhostLayerY && + isPartOfMaskSet(flagField->get(x, y, z), flagField->getMask(flagInfo_.getObstacleIDSet()))) + { + isObstacleInGlobalGhostLayer[1] = true; + } + + if (!isObstacleInGlobalGhostLayer[2] && isCellInGlobalGhostLayerZ && + isPartOfMaskSet(flagField->get(x, y, z), flagField->getMask(flagInfo_.getObstacleIDSet()))) + { + isObstacleInGlobalGhostLayer[2] = true; + } + + if (isObstacleInGlobalGhostLayer[0] && isObstacleInGlobalGhostLayer[1] && isObstacleInGlobalGhostLayer[2]) + { + break; // there is no need to check other cells on this block + } + }) // WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ_OMP + } + + mpi::allReduceInplace(isObstacleInGlobalGhostLayer, mpi::LOGICAL_OR); + + return isObstacleInGlobalGhostLayer; +} + +template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T > +void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, + ParticleAccessor_T >::initFlagsFromFillLevel() +{ + const Vector3< bool > isObstacleInGlobalGhostLayer = 
this->isObstacleInGlobalGhostLayer(); + + WALBERLA_ROOT_SECTION() + { + if ((blockForest_->isXPeriodic() && isObstacleInGlobalGhostLayer[0]) || + (blockForest_->isYPeriodic() && isObstacleInGlobalGhostLayer[1]) || + (blockForest_->isZPeriodic() && isObstacleInGlobalGhostLayer[2])) + { + WALBERLA_LOG_WARNING_ON_ROOT( + "WARNING: An obstacle cell is located in a global outermost ghost layer in a periodic " + "direction. Be aware that due to periodicity, this obstacle cell will be " + "overwritten during communication."); + } + } + + // communicate fill level (neighborhood is used in initialization) + comm_(); + + // initialize fill level in boundaries (with value 1), i.e., obstacles such that the bubble model does not detect + // obstacles as gas cells + walberla::free_surface::initFillLevelsInBoundaries< BoundaryHandling_T, typename LatticeModel_T::Stencil, + ScalarField_T >(blockForest_, handlingID_, fillFieldID_); + + // clear and initialize flags in every cell according to the fill level + walberla::free_surface::initFlagsFromFillLevels< BoundaryHandling_T, typename LatticeModel_T::Stencil, FlagField_T, + const ScalarField_T >(blockForest_, flagInfo_, handlingID_, + fillFieldID_); +} + +} // namespace free_surface +} // namespace antidunes +} // namespace walberla diff --git a/apps/showcases/Antidunes/AntidunesLatticeModelGeneration.py b/apps/showcases/Antidunes/AntidunesLatticeModelGeneration.py new file mode 100644 index 0000000000000000000000000000000000000000..1590611076eb94519eca728168974c132bbcc6df --- /dev/null +++ b/apps/showcases/Antidunes/AntidunesLatticeModelGeneration.py @@ -0,0 +1,36 @@ +import sympy as sp +import pystencils as ps +from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, create_lb_collision_rule +from lbmpy.enums import ForceModel, Method, Stencil +from lbmpy.stencils import LBStencil + +from pystencils_walberla import CodeGeneration +from lbmpy_walberla import generate_lattice_model + +with CodeGeneration() as ctx: + # 
general parameters + layout = 'fzyx' + data_type = "float64" if ctx.double_accuracy else "float32" + + stencil = LBStencil(Stencil.D3Q27) + omega = sp.Symbol('omega') + force_field = ps.fields(f"force(3): {data_type}[3D]", layout=layout) + + # method definition + lbm_config = LBMConfig(stencil=stencil, + method=Method.CUMULANT, + relaxation_rate=omega, + compressible=True, + force=force_field, + zero_centered=False, + streaming_pattern='pull', # free surface implementation only works with pull pattern + galilean_correction=True) + + # optimizations to be used by the code generator + lbm_opt = LBMOptimisation(cse_global=True, + field_layout=layout) + + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, + lbm_optimisation=lbm_opt) + + generate_lattice_model(ctx, "AntidunesLatticeModel", collision_rule, field_layout=layout) diff --git a/apps/showcases/Antidunes/BedGeneration.cpp b/apps/showcases/Antidunes/BedGeneration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..baac22f4d82bcd9ed161df5136c9919a13d83ac1 --- /dev/null +++ b/apps/showcases/Antidunes/BedGeneration.cpp @@ -0,0 +1,265 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BedGeneration.cpp +//! 
\author Samuel Kemmler <samuel.kemmler@fau.de> +//! \author Christoph Rettinger <christoph.rettinger@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" + +#include "core/Environment.h" +#include "core/grid_generator/HCPIterator.h" +#include "core/grid_generator/SCIterator.h" +#include "core/math/Random.h" + +#include "mesa_pd/collision_detection/AnalyticContactDetection.h" +#include "mesa_pd/data/DataTypes.h" +#include "mesa_pd/data/LinkedCells.h" +#include "mesa_pd/data/ParticleAccessorWithBaseShape.h" +#include "mesa_pd/data/ParticleStorage.h" +#include "mesa_pd/data/shape/Sphere.h" +#include "mesa_pd/domain/BlockForestDomain.h" +#include "mesa_pd/kernel/AssocToBlock.h" +#include "mesa_pd/kernel/DoubleCast.h" +#include "mesa_pd/kernel/ExplicitEuler.h" +#include "mesa_pd/kernel/InsertParticleIntoLinkedCells.h" +#include "mesa_pd/kernel/LinearSpringDashpot.h" +#include "mesa_pd/kernel/ParticleSelector.h" +#include "mesa_pd/mpi/ContactFilter.h" +#include "mesa_pd/mpi/ReduceProperty.h" +#include "mesa_pd/mpi/SyncNextNeighborsBlockForest.h" +#include "mesa_pd/vtk/ParticleVtkOutput.h" + +#include "vtk/VTKOutput.h" + +#include <mesa_pd/mpi/notifications/ForceTorqueNotification.h> + +#include "Utility.h" + +namespace walberla +{ +namespace antidunes +{ + +using namespace mesa_pd; + +int main(int argc, char** argv) +{ + Environment env(argc, argv); + walberla::mpi::MPIManager::instance()->useWorldComm(); + + // Config + auto cfg = env.config(); + if (cfg == nullptr) WALBERLA_ABORT("No config specified!"); + WALBERLA_LOG_INFO_ON_ROOT(*cfg); + const Config::BlockHandle bedGenerationConf = cfg->getBlock("BedGeneration"); + + Vec3 domainSize_SI = bedGenerationConf.getParameter< Vec3 >("domainSize_SI"); + Vector3< int > blocks = bedGenerationConf.getParameter< Vector3< int > >("blocks"); + real_t diameter_SI = bedGenerationConf.getParameter< real_t 
>("diameter_SI"); + real_t gravity_SI = bedGenerationConf.getParameter< real_t >("gravity_SI"); + real_t densityFluid_SI = bedGenerationConf.getParameter< real_t >("densityFluid_SI"); + real_t densityParticle_SI = bedGenerationConf.getParameter< real_t >("densityParticle_SI"); + real_t generationSpacing_SI = bedGenerationConf.getParameter< real_t >("generationSpacing_SI"); + real_t initialVelocity_SI = bedGenerationConf.getParameter< real_t >("initialVelocity_SI"); + real_t dt_SI = bedGenerationConf.getParameter< real_t >("dt_SI"); + real_t frictionCoefficient = bedGenerationConf.getParameter< real_t >("frictionCoefficient"); + real_t restitutionCoefficient = bedGenerationConf.getParameter< real_t >("restitutionCoefficient"); + real_t collisionTime_SI = bedGenerationConf.getParameter< real_t >("collisionTime_SI"); + real_t poissonsRatio = bedGenerationConf.getParameter< real_t >("poissonsRatio"); + uint_t timeSteps = bedGenerationConf.getParameter< uint_t >("timeSteps"); + uint_t visSpacing = bedGenerationConf.getParameter< uint_t >("visSpacing"); + std::string outFileName = bedGenerationConf.getParameter< std::string >("outFileName"); + bool denseBottomLayer = bedGenerationConf.getParameter< bool >("denseBottomLayer"); + real_t bottomLayerOffsetFactor = bedGenerationConf.getParameter< real_t >("bottomLayerOffsetFactor"); + + // BlockForest + math::AABB simulationDomain_SI(real_t(0.0), real_t(0.0), real_t(0.0), domainSize_SI[0], domainSize_SI[1], + domainSize_SI[2]); + Vector3< bool > isPeriodic{ true, true, false }; + + shared_ptr< BlockForest > forest = blockforest::createBlockForest(simulationDomain_SI, blocks, isPeriodic); + auto domain = std::make_shared< mesa_pd::domain::BlockForestDomain >(forest); + + // MesaPD data structures + auto ps = std::make_shared< data::ParticleStorage >(1); + data::ParticleAccessorWithBaseShape accessor(ps); + + // Init spheres + real_t minDiameter_SI = diameter_SI * real_t(0.9); + real_t maxDiameter_SI = diameter_SI * 
real_t(1.1); + + math::AABB generationDomain_SI(simulationDomain_SI.xMin(), simulationDomain_SI.yMin(), + simulationDomain_SI.zMin() + diameter_SI, simulationDomain_SI.xMax(), + simulationDomain_SI.yMax(), simulationDomain_SI.zMax()); + + for (auto pt : + grid_generator::SCGrid(generationDomain_SI, Vec3(generationSpacing_SI) * real_c(0.5), generationSpacing_SI)) + { + auto diameter = math::realRandom< real_t >(minDiameter_SI, maxDiameter_SI); + + if (!domain->isContainedInLocalSubdomain(pt, real_t(0))) continue; + auto p = ps->create(); + p->getPositionRef() = pt; + p->getInteractionRadiusRef() = diameter * real_t(0.5); + p->getBaseShapeRef() = std::make_shared< data::Sphere >(p->getInteractionRadius()); + p->getBaseShapeRef()->updateMassAndInertia(densityParticle_SI); + + p->setLinearVelocity(Vec3(real_t(0.1) * math::realRandom(-initialVelocity_SI, initialVelocity_SI), + real_t(0.1) * math::realRandom(-initialVelocity_SI, initialVelocity_SI), + -initialVelocity_SI)); + p->getOwnerRef() = walberla::mpi::MPIManager::instance()->rank(); + p->getTypeRef() = 0; + } + + math::AABB bottomLayerDomain_SI(simulationDomain_SI.xMin(), simulationDomain_SI.yMin(), simulationDomain_SI.zMin(), + simulationDomain_SI.xMax(), simulationDomain_SI.yMax(), diameter_SI); + + real_t bottomLayerSpacing = bottomLayerDomain_SI.xSize() / std::floor(bottomLayerDomain_SI.xSize() / diameter_SI); + real_t bottomLayerYStretchFactor = + real_t((bottomLayerDomain_SI.ySize() / (sqrt(3_r) * bottomLayerSpacing)) / + std::floor(bottomLayerDomain_SI.ySize() / (sqrt(3_r) * bottomLayerSpacing))); + if (denseBottomLayer) + { + bottomLayerYStretchFactor = real_t((bottomLayerDomain_SI.ySize() / (sqrt(3_r) * bottomLayerSpacing)) / + std::ceil(bottomLayerDomain_SI.ySize() / (sqrt(3_r) * bottomLayerSpacing))); + } + WALBERLA_LOG_INFO_ON_ROOT(bottomLayerSpacing << " " << bottomLayerYStretchFactor); + for (auto pt : grid_generator::HCPGrid(bottomLayerDomain_SI, Vec3(diameter_SI) * real_c(0.5), 
bottomLayerSpacing)) + { + auto diameter = math::realRandom< real_t >(minDiameter_SI, maxDiameter_SI); + auto zCoord = math::realRandom< real_t >(real_t(1e-10), diameter_SI * bottomLayerOffsetFactor); + Vec3 position{ pt[0], pt[1] * bottomLayerYStretchFactor, zCoord }; + + if (!domain->isContainedInLocalSubdomain(position, real_t(0))) continue; + auto p = ps->create(); + p->getPositionRef() = position; + p->getInteractionRadiusRef() = diameter * real_t(0.5); + p->getBaseShapeRef() = std::make_shared< data::Sphere >(p->getInteractionRadius()); + p->getBaseShapeRef()->updateMassAndInertia(densityParticle_SI); + + p->getOwnerRef() = walberla::mpi::MPIManager::instance()->rank(); + p->getTypeRef() = 0; + mesa_pd::data::particle_flags::set(p->getFlagsRef(), mesa_pd::data::particle_flags::FIXED); + } + + createPlane(*ps, simulationDomain_SI.minCorner(), Vec3(real_t(0), real_t(0), real_t(1))); + createPlane(*ps, simulationDomain_SI.maxCorner(), Vec3(real_t(0), real_t(0), real_t(-1))); + + // VTK + auto vtkDomainOutput = + walberla::vtk::createVTKOutput_DomainDecomposition(forest, "domain_decomposition", 1, "vtk", "simulation_step"); + vtkDomainOutput->write(); + + auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps); + particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity"); + particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius"); + particleVtkOutput->setParticleSelector([](const data::ParticleStorage::iterator& pIt) { + using namespace walberla::mesa_pd::data::particle_flags; + return (pIt->getBaseShape()->getShapeType() == data::Sphere::SHAPE_TYPE) && !isSet(pIt->getFlags(), GHOST); + }); + auto vtkWriter = walberla::vtk::createVTKOutput_PointData(particleVtkOutput, "Particles", 1, "vtk", + "simulation_step", false, false); + + // Init kernels + kernel::ExplicitEuler explicitEulerWithShape(dt_SI); + kernel::LinearSpringDashpot dem(1); + dem.setFrictionCoefficientDynamic(0, 0, 
frictionCoefficient); + real_t kappa = real_t(2) * (real_t(1) - poissonsRatio) / (real_t(2) - poissonsRatio); // from Thornton et al + + kernel::AssocToBlock assoc(forest); + mesa_pd::mpi::ReduceProperty RP; + mesa_pd::mpi::SyncNextNeighborsBlockForest SNN; + + ps->forEachParticle(false, kernel::SelectLocal(), accessor, assoc, accessor); + + // initial sync + SNN(*ps, forest, domain); + + real_t averageVelocity = real_t(0); + uint_t currentNumParticles = 0; + real_t maxVelocity = real_t(0); + real_t maxHeight = real_t(0); + + real_t linkedCellWidth = 1.01_r * maxDiameter_SI; + data::LinkedCells linkedCells(domain->getUnionOfLocalAABBs().getExtended(linkedCellWidth), linkedCellWidth); + kernel::InsertParticleIntoLinkedCells ipilc; + + for (uint_t i = 0; i < timeSteps; ++i) + { + if (i % visSpacing == 0) { vtkWriter->write(); } + + ps->forEachParticle(false, kernel::SelectLocal(), accessor, assoc, accessor); + + SNN(*ps, forest, domain); + + // gravity - buoyancy + ps->forEachParticle( + false, kernel::SelectLocal(), accessor, + [densityParticle_SI, densityFluid_SI, gravity_SI](const size_t idx, auto& ac) { + mesa_pd::addForceAtomic( + idx, ac, Vec3(0, 0, -(densityParticle_SI - densityFluid_SI) * ac.getVolume(idx) * gravity_SI)); + }, + accessor); + + linkedCells.clear(); + ps->forEachParticle(false, kernel::SelectAll(), accessor, ipilc, accessor, linkedCells); + linkedCells.forEachParticlePairHalf( + false, kernel::ExcludeInfiniteInfinite(), accessor, + [restitutionCoefficient, collisionTime_SI, kappa, domain, &dem, dt_SI](const size_t idx1, const size_t idx2, + auto& ac) { + kernel::DoubleCast double_cast; + mesa_pd::mpi::ContactFilter contact_filter; + collision_detection::AnalyticContactDetection acd; + + if (double_cast(idx1, idx2, ac, acd, ac)) + { + if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain)) + { + auto meff = real_t(1) / (ac.getInvMass(idx1) + ac.getInvMass(idx2)); + dem.setStiffnessAndDamping(0, 0, 
restitutionCoefficient, collisionTime_SI, kappa, meff); + dem(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), + acd.getPenetrationDepth(), dt_SI); + } + } + }, + accessor); + + RP.operator()< ForceTorqueNotification >(*ps); + + ps->forEachParticle(false, kernel::SelectLocal(), accessor, explicitEulerWithShape, accessor); + + getAverageVelocity(accessor, averageVelocity, maxVelocity, currentNumParticles, maxHeight); + + SNN(*ps, forest, domain); + + if (i % 1000 == 0) + { + WALBERLA_LOG_INFO_ON_ROOT("Timestep " << i << " / " << timeSteps << ", average velocity = " << averageVelocity + << ", max velocity = " << maxVelocity << ", #particles = " + << currentNumParticles << ", max height = " << maxHeight); + } + } + + writeSphereInformationToFile(outFileName, *ps, domainSize_SI); + + return EXIT_SUCCESS; +} +} // namespace antidunes +} // namespace walberla + +int main(int argc, char** argv) { return walberla::antidunes::main(argc, argv); } diff --git a/apps/showcases/Antidunes/BedGeneration.prm b/apps/showcases/Antidunes/BedGeneration.prm new file mode 100644 index 0000000000000000000000000000000000000000..af2ecf9645a560468350281c27319a140e2d5bc4 --- /dev/null +++ b/apps/showcases/Antidunes/BedGeneration.prm @@ -0,0 +1,20 @@ +BedGeneration{ + domainSize_SI < 0.8, 0.015, 0.2 >; + blocks < 3, 3, 1 >; + diameter_SI 0.0029; + gravity_SI 9.81; + densityFluid_SI 1000; + densityParticle_SI 2550; + generationSpacing_SI 0.005; + initialVelocity_SI 1; + dt_SI 5e-5; + frictionCoefficient 0.5; + restitutionCoefficient 0.1; + collisionTime_SI 5e-4; + poissonsRatio 0.22; + timeSteps 10000; + visSpacing 100; + outFileName spheres_out.dat; + denseBottomLayer False; + bottomLayerOffsetFactor 1.0; +} diff --git a/apps/showcases/Antidunes/CMakeLists.txt b/apps/showcases/Antidunes/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0feea41cc2502dc568f9ee3aa2271e2932760e9 --- /dev/null +++ 
b/apps/showcases/Antidunes/CMakeLists.txt @@ -0,0 +1,16 @@ +waLBerla_link_files_to_builddir( *.prm ) + +if( WALBERLA_BUILD_WITH_CODEGEN ) + walberla_generate_target_from_python( NAME AntidunesLatticeModelGeneration + FILE AntidunesLatticeModelGeneration.py + OUT_FILES AntidunesLatticeModel.cpp AntidunesLatticeModel.h ) + + waLBerla_add_executable(NAME Antidunes + FILES Antidunes.cpp PIDController.cpp + DEPENDS blockforest boundary core domain_decomposition field lbm mesa_pd lbm_mesapd_coupling + postprocessing timeloop vtk AntidunesLatticeModelGeneration) +endif() + +waLBerla_add_executable(NAME BedGeneration + FILES BedGeneration.cpp + DEPENDS blockforest core domain_decomposition mesa_pd vtk) diff --git a/apps/showcases/Antidunes/PIDController.cpp b/apps/showcases/Antidunes/PIDController.cpp new file mode 100644 index 0000000000000000000000000000000000000000..83df9f470f30f259a4cdab8073ff4e67ff0f1f86 --- /dev/null +++ b/apps/showcases/Antidunes/PIDController.cpp @@ -0,0 +1,115 @@ +//====================================================================================================================== +/*! 
+ * \file PIDController.cpp + */ +//====================================================================================================================== + +#include "PIDController.h" + +#include <algorithm> +#include <fstream> + +using namespace walberla; +using walberla::real_t; + +PIDController::PIDController() + : commandVariable_(0), actuatingVariable_(0), proportionalGain_(0), derivateGain_(0), integralGain_(0), maxRamp_(0), + minActuatingVariable_(0), maxActuatingVariable_(0), errorIntegral_(0) +{ + std::fill(errorHistory_, errorHistory_ + sizeof(errorHistory_) / sizeof(real_t), real_t(0)); +} + +PIDController::PIDController(const real_t commandVariable, const real_t initialActuatingVariable, + const real_t proportionalGain, const real_t derivateGain, const real_t integralGain, + const real_t maxRamp, const real_t minActuatingVariable, const real_t maxActuatingVariable) + : commandVariable_(commandVariable), actuatingVariable_(initialActuatingVariable), + proportionalGain_(proportionalGain), derivateGain_(derivateGain), integralGain_(integralGain), maxRamp_(maxRamp), + minActuatingVariable_(minActuatingVariable), maxActuatingVariable_(maxActuatingVariable), errorIntegral_(0) +{ + std::fill(errorHistory_, errorHistory_ + sizeof(errorHistory_) / sizeof(real_t), real_t(0)); + + if (integralGain_ > real_t(0)) + errorIntegral_ = initialActuatingVariable / integralGain_; + else + errorIntegral_ = real_t(0); +} + +PIDController::PIDController(const real_t commandVariable, const real_t initialActuatingVariable, + const real_t proportionalGain, const real_t derivateGain, const real_t integralGain) + : commandVariable_(commandVariable), actuatingVariable_(initialActuatingVariable), + proportionalGain_(proportionalGain), derivateGain_(derivateGain), integralGain_(integralGain), + maxRamp_(std::numeric_limits< real_t >::max()), minActuatingVariable_(-std::numeric_limits< real_t >::max()), + maxActuatingVariable_(std::numeric_limits< real_t >::max()), errorIntegral_(0) 
+{ + std::fill(errorHistory_, errorHistory_ + sizeof(errorHistory_) / sizeof(real_t), real_t(0)); + + if (integralGain_ > real_t(0)) + errorIntegral_ = initialActuatingVariable / integralGain_; + else + errorIntegral_ = real_t(0); +} + +real_t PIDController::update(const real_t controlledVariable) +{ + static const real_t ONE_OVER_SIX = real_t(1) / real_t(6); + const real_t error = commandVariable_ - controlledVariable; + + const real_t d = + (error + real_t(3) * errorHistory_[0] - real_t(3) * errorHistory_[1] - errorHistory_[2]) * ONE_OVER_SIX; + std::rotate(errorHistory_, errorHistory_ + 1, errorHistory_ + sizeof(errorHistory_) / sizeof(real_t)); + errorHistory_[sizeof(errorHistory_) / sizeof(real_t) - size_t(1)] = error; + + real_t newActuationVariable = proportionalGain_ * error + derivateGain_ * d + integralGain_ * errorIntegral_; + + if (std::fabs(actuatingVariable_ - newActuationVariable) < maxRamp_) + { + errorIntegral_ += error; + newActuationVariable = proportionalGain_ * error + derivateGain_ * d + integralGain_ * errorIntegral_; + } + + const real_t maxValue = std::min(actuatingVariable_ + maxRamp_, maxActuatingVariable_); + const real_t minValue = std::max(actuatingVariable_ - maxRamp_, minActuatingVariable_); + + actuatingVariable_ = std::min(std::max(minValue, newActuationVariable), maxValue); + + return actuatingVariable_; +} + +void PIDController::writeStateToFile(std::string filename) const +{ + std::ofstream file; + file.open(filename); + file << std::setprecision(16); + file << commandVariable_ << std::endl; + file << actuatingVariable_ << std::endl; + file << proportionalGain_ << std::endl; + file << derivateGain_ << std::endl; + file << integralGain_ << std::endl; + file << maxRamp_ << std::endl; + file << minActuatingVariable_ << std::endl; + file << maxActuatingVariable_ << std::endl; + file << errorHistory_[0] << std::endl; + file << errorHistory_[1] << std::endl; + file << errorHistory_[2] << std::endl; + file << errorIntegral_ << 
std::endl; + file.close(); +} + +void PIDController::readStateFromFile(std::string filename) +{ + std::ifstream file; + file.open(filename); + file >> commandVariable_; + file >> actuatingVariable_; + file >> proportionalGain_; + file >> derivateGain_; + file >> integralGain_; + file >> maxRamp_; + file >> minActuatingVariable_; + file >> maxActuatingVariable_; + file >> errorHistory_[0]; + file >> errorHistory_[1]; + file >> errorHistory_[2]; + file >> errorIntegral_; + file.close(); +} diff --git a/apps/showcases/Antidunes/PIDController.h b/apps/showcases/Antidunes/PIDController.h new file mode 100644 index 0000000000000000000000000000000000000000..aeae57260688bbeca012a240fac40209e94e2e55 --- /dev/null +++ b/apps/showcases/Antidunes/PIDController.h @@ -0,0 +1,51 @@ +//====================================================================================================================== +/*! + * \file PIDController.h + */ +//====================================================================================================================== + +#pragma once + +#include <core/DataTypes.h> +#include <core/logging/Logging.h> + +using namespace walberla; +using walberla::real_t; + +class PIDController +{ + public: + PIDController(); + + PIDController(const real_t commandVariable, const real_t initialActuatingVariable, const real_t proportionalGain, + const real_t derivateGain, const real_t integralGain, const real_t maxRamp, + const real_t minActuatingVariable, const real_t maxActuatingVariable); + + PIDController(const real_t commandVariable, const real_t initialActuatingVariable, const real_t proportionalGain, + const real_t derivateGain, const real_t integralGain); + + real_t update(const real_t controlledVariable); + + real_t getProportionalGain() const { return proportionalGain_; } + real_t getDerivateGain() const { return derivateGain_; } + real_t getIntegralGain() const { return integralGain_; } + + void writeStateToFile(std::string filename) const; + + void 
readStateFromFile(std::string filename); + + private: + real_t commandVariable_; + real_t actuatingVariable_; + + real_t proportionalGain_; + real_t derivateGain_; + real_t integralGain_; + + real_t maxRamp_; + real_t minActuatingVariable_; + real_t maxActuatingVariable_; + + real_t errorHistory_[3]; + real_t errorIntegral_; +}; diff --git a/apps/showcases/Antidunes/Utility.h b/apps/showcases/Antidunes/Utility.h new file mode 100644 index 0000000000000000000000000000000000000000..e3e0e2149a01f26ee097287494d2d982ef9025c3 --- /dev/null +++ b/apps/showcases/Antidunes/Utility.h @@ -0,0 +1,472 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Utility.h +//! \author Samuel Kemmler <samuel.kemmler@fau.de> +//! 
\author Christoph Rettinger <christoph.rettinger@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "mesa_pd/data/ParticleStorage.h" +#include "mesa_pd/data/shape/Sphere.h" + +#include <algorithm> +#include <core/mpi/Broadcast.h> +#include <core/mpi/MPITextFile.h> +#include <core/mpi/Reduce.h> +#include <functional> +#include <iterator> + +namespace walberla +{ +namespace antidunes +{ + +struct SphereSelector +{ + template< typename ParticleAccessor_T > + bool inline operator()(const size_t particleIdx, const ParticleAccessor_T& ac) const + { + static_assert(std::is_base_of< mesa_pd::data::IAccessor, ParticleAccessor_T >::value, + "Provide a valid accessor as template"); + return ac.getBaseShape(particleIdx)->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE; + } +}; + +void renameFile(const std::string& oldName, const std::string& newName) +{ + int result = std::rename(oldName.c_str(), newName.c_str()); + if (result != 0) + WALBERLA_LOG_WARNING_ON_ROOT("Could not rename file " << oldName << " to " << newName << " with error code " + << result); +} + +void write2DVectorToFile(const std::vector< real_t >& vec, uint_t len1, uint_t len2, std::string filename) +{ + std::ofstream file; + file.open(filename.c_str()); + file.precision(5); + + file << "# " << len1 << " " << len2 << "\n"; + + for (uint_t j = uint_t(0); j < len2; ++j) + { + for (uint_t i = uint_t(0); i < len1; ++i) + { + file << vec[i + j * len1] << "\n"; + } + } + file.close(); +} + +template< typename ParticleAccessor_T > +class BedloadTransportEvaluator +{ + public: + BedloadTransportEvaluator(const shared_ptr< ParticleAccessor_T >& ac, real_t normalizationFactor, + uint_t numParticles) + : ac_(ac), normalizationFactor_(normalizationFactor), numParticles_(numParticles) + {} + + void operator()() + { + real_t transportRate(real_t(0)); + real_t velocity(real_t(0)); + + for (uint_t i = uint_t(0); 
i < ac_->size(); ++i) + { + if (!isSet(ac_->getFlags(i), mesa_pd::data::particle_flags::GHOST) && + !isSet(ac_->getFlags(i), mesa_pd::data::particle_flags::GLOBAL)) + { + auto velX = ac_->getLinearVelocity(i)[0]; + transportRate += velX * ac_->getVolume(i); + velocity += velX; + } + } + + // only reduce to root + WALBERLA_MPI_SECTION() + { + mpi::reduceInplace(transportRate, mpi::SUM); + mpi::reduceInplace(velocity, mpi::SUM); + } + + avgTransportRate_ = transportRate * normalizationFactor_; + averageVelocity_ = velocity / real_c(numParticles_); + } + + // sum_i V_p,i * u_p,i / (L*W) + real_t getTransportRate() const { return avgTransportRate_; } + + real_t getAverageVelocity() const { return averageVelocity_; } + + private: + shared_ptr< ParticleAccessor_T > ac_; + real_t normalizationFactor_; + uint_t numParticles_; + real_t averageVelocity_; + real_t avgTransportRate_; +}; + +template< typename ParticleAccessor_T > +Vector3< real_t > getTotalHydrodynamicForceOnParticles(const shared_ptr< ParticleAccessor_T >& ac) +{ + Vector3< real_t > totalHydrodynamicForce(0_r); + for (uint_t i = uint_t(0); i < ac->size(); ++i) + { + if (!isSet(ac->getFlags(i), mesa_pd::data::particle_flags::GHOST) && + !isSet(ac->getFlags(i), mesa_pd::data::particle_flags::GLOBAL)) + { + totalHydrodynamicForce += ac->getHydrodynamicForce(i); + } + } + // only reduce to root + mpi::reduceInplace(totalHydrodynamicForce, mpi::SUM); + + return totalHydrodynamicForce; +} + +// evaluates slices of solid volume fraction and fill level +template< typename PdfField_T, typename AntidunesBoundaryHandling_T, typename FlagField_T, typename ScalarField_T > +class AverageDataSliceEvaluator +{ + public: + AverageDataSliceEvaluator(const shared_ptr< StructuredBlockStorage >& blocks, const ConstBlockDataID& flagFieldID, + const ConstBlockDataID& fillFieldID, const ConstBlockDataID& pdfFieldID) + : blocks_(blocks), flagFieldID_(flagFieldID), fillFieldID_(fillFieldID), pdfFieldID_(pdfFieldID) + { + xlen_ = 
blocks_->getNumberOfXCells(); + ylen_ = blocks_->getNumberOfYCells(); + zlen_ = blocks_->getNumberOfZCells(); + + x_z_SolidVolumeFraction_ = std::vector< real_t >(xlen_ * zlen_, real_t(0)); + x_z_FillLevel_ = std::vector< real_t >(xlen_ * zlen_, real_t(0)); + x_z_VelocityX_ = std::vector< real_t >(xlen_ * zlen_, real_t(0)); + x_z_FluidCellCount_ = std::vector< uint_t >(xlen_ * zlen_, 0); + maxFluidZPos_ = uint_t(0); + } + + void operator()() + { + // erase data + std::fill(x_z_SolidVolumeFraction_.begin(), x_z_SolidVolumeFraction_.end(), real_t(0)); + std::fill(x_z_FillLevel_.begin(), x_z_FillLevel_.end(), real_t(0)); + std::fill(x_z_VelocityX_.begin(), x_z_VelocityX_.end(), real_t(0)); + std::fill(x_z_FluidCellCount_.begin(), x_z_FluidCellCount_.end(), 0); + + // fill contributions + for (auto block = blocks_->begin(); block != blocks_->end(); ++block) + { + const PdfField_T* const pdfField = block->getData< const PdfField_T >(pdfFieldID_); + const FlagField_T* const flagField = block->getData< const FlagField_T >(flagFieldID_); + const ScalarField_T* const fillField = block->getData< const ScalarField_T >(fillFieldID_); + + const auto solidMO = flagField->getFlag(AntidunesBoundaryHandling_T::movingObstacleFlagID); + const auto solidNoSlip = flagField->getFlag(AntidunesBoundaryHandling_T::noSlipFlagID); + + CellInterval xyz = flagField->xyzSize(); + Cell globalCell; + + maxFluidZPos_ = uint_t(0); + + // iterate all (inner) cells in the field + for (auto cell = xyz.begin(); cell != xyz.end(); ++cell) + { + blocks_->transformBlockLocalToGlobalCell(globalCell, *block, *cell); + auto entryIdx = uint_c(globalCell.x()) + uint_c(globalCell.z()) * xlen_; + if (flagField->isFlagSet(*cell, solidMO) || flagField->isFlagSet(*cell, solidNoSlip)) + { + x_z_SolidVolumeFraction_[entryIdx] += real_t(1); + x_z_FillLevel_[entryIdx] += real_t(1); + } + else + { + auto fillLevel = fillField->get(*cell); + x_z_FillLevel_[entryIdx] += fillLevel; + if (fillLevel > 0_r) + { + 
x_z_VelocityX_[entryIdx] += pdfField->getVelocity(*cell)[0]; + ++x_z_FluidCellCount_[entryIdx]; + + maxFluidZPos_ = std::max(uint_t(globalCell.z()), maxFluidZPos_); + } + } + } + } + + // reduce this information to the root process + mpi::reduceInplace(x_z_SolidVolumeFraction_, mpi::SUM); + mpi::reduceInplace(x_z_FillLevel_, mpi::SUM); + mpi::reduceInplace(x_z_VelocityX_, mpi::SUM); + mpi::reduceInplace(x_z_FluidCellCount_, mpi::SUM); + mpi::reduceInplace(maxFluidZPos_, mpi::MAX); + + // normalize + for (uint_t i = 0; i < x_z_VelocityX_.size(); ++i) + { + if (x_z_FluidCellCount_[i] > 0) x_z_VelocityX_[i] /= real_c(x_z_FluidCellCount_[i]); + } + real_t invNumYCells = 1_r / real_c(ylen_); + std::for_each(x_z_SolidVolumeFraction_.begin(), x_z_SolidVolumeFraction_.end(), + [invNumYCells](real_t& n) { n *= invNumYCells; }); + std::for_each(x_z_FillLevel_.begin(), x_z_FillLevel_.end(), [invNumYCells](real_t& n) { n *= invNumYCells; }); + + // note: only root process has the complete information! 
+ } + + std::vector< real_t >& getSolidVolumeFractionVector() { return x_z_SolidVolumeFraction_; } + std::vector< real_t >& getFillLevelVector() { return x_z_FillLevel_; } + std::vector< real_t >& getVelocityXVector() { return x_z_VelocityX_; } + std::vector< uint_t >& getFluidCellCountVector() { return x_z_FluidCellCount_; } + uint_t getMaxFluidZPos() { return maxFluidZPos_; } + + uint_t getXLen() const { return xlen_; } + uint_t getYLen() const { return ylen_; } + uint_t getZLen() const { return zlen_; } + + private: + shared_ptr< StructuredBlockStorage > blocks_; + const ConstBlockDataID flagFieldID_; + const ConstBlockDataID fillFieldID_; + const ConstBlockDataID pdfFieldID_; + + uint_t xlen_; + uint_t ylen_; + uint_t zlen_; + uint_t maxFluidZPos_; + std::vector< real_t > x_z_SolidVolumeFraction_; + std::vector< real_t > x_z_FillLevel_; + std::vector< real_t > x_z_VelocityX_; + std::vector< uint_t > x_z_FluidCellCount_; +}; + +void writeSphereInformationToFile(const std::string& filename, walberla::mesa_pd::data::ParticleStorage& ps, + Vector3< real_t >& domainSize, int precision = 12) +{ + std::ostringstream ossData; + ossData << std::setprecision(precision); + + WALBERLA_ROOT_SECTION() { ossData << domainSize[0] << " " << domainSize[1] << " " << domainSize[2] << "\n"; } + + for (auto pIt : ps) + { + using namespace walberla::mesa_pd::data; + if (pIt->getBaseShape()->getShapeType() != Sphere::SHAPE_TYPE) continue; + using namespace walberla::mesa_pd::data::particle_flags; + if (isSet(pIt->getFlags(), GHOST)) continue; + auto sp = static_cast< Sphere* >(pIt->getBaseShape().get()); + + auto position = pIt->getPosition(); + + ossData << pIt->getUid() << " " << position[0] << " " << position[1] << " " << position[2] << " " + << sp->getRadius() << '\n'; + } + + walberla::mpi::writeMPITextFile(filename, ossData.str()); +} + +void getAvgDiameterScalingFactor(const std::string& filename, const Vector3< uint_t >& domainSize, + const uint_t bedCopiesInX, const uint_t 
bedCopiesInY, real_t& avgDiameter, + real_t& scalingFactor) +{ + using namespace walberla; + + std::string textFile; + + WALBERLA_ROOT_SECTION() + { + std::ifstream t(filename.c_str()); + if (!t) { WALBERLA_ABORT("Invalid input file " << filename << "\n"); } + std::stringstream buffer; + buffer << t.rdbuf(); + textFile = buffer.str(); + + std::istringstream fileIss(textFile); + std::string line; + + // first line contains generation domain sizes + std::getline(fileIss, line); + Vector3< real_t > generationDomainSize_SI(0_r); + std::istringstream firstLine(line); + firstLine >> generationDomainSize_SI[0] >> generationDomainSize_SI[1] >> generationDomainSize_SI[2]; + + real_t diameter_SI = 0.0; + uint_t numParticles = 0; + while (std::getline(fileIss, line)) + { + std::istringstream iss(line); + + mesa_pd::data::ParticleStorage::uid_type uID; + mesa_pd::data::ParticleStorage::position_type pos(0_r); + walberla::real_t radius = 0; + iss >> uID >> pos[0] >> pos[1] >> pos[2] >> radius; + WALBERLA_CHECK_GREATER(radius, 0_r, "Invalid radius of " << radius << " found in input file!") + + diameter_SI += 2_r * radius; + + numParticles++; + } + diameter_SI /= real_t(numParticles); + + scalingFactor = real_c(domainSize[0]) / (generationDomainSize_SI[0] * real_c(bedCopiesInX)); + avgDiameter = diameter_SI * scalingFactor; + + WALBERLA_CHECK_EQUAL(uint_c(scalingFactor * generationDomainSize_SI[1] * real_c(bedCopiesInY)), domainSize[1], + "Error: Generated bed with copies and simulation domain do not match!") + } + + walberla::mpi::broadcastObject(scalingFactor); + walberla::mpi::broadcastObject(avgDiameter); +} + +void initSpheresFromFile(const std::string& filename, walberla::mesa_pd::data::ParticleStorage& ps, + const walberla::mesa_pd::domain::IDomain& domain, walberla::real_t density, + const Vector3< uint_t >& domainSize, + const std::function< bool(walberla::Vector3< real_t >) >& particleCreateFunction, + math::AABB simulationDomain, uint_t bedCopiesInX, uint_t 
bedCopiesInY, uint_t& numParticles, + real_t& maxParticleHeight, const real_t& scalingFactor) +{ + using namespace walberla; + using namespace walberla::mesa_pd; + using namespace walberla::mesa_pd::data; + + auto rank = walberla::mpi::MPIManager::instance()->rank(); + + std::string textFile; + + WALBERLA_ROOT_SECTION() + { + std::ifstream t(filename.c_str()); + if (!t) { WALBERLA_ABORT("Invalid input file " << filename << "\n"); } + std::stringstream buffer; + buffer << t.rdbuf(); + textFile = buffer.str(); + } + + walberla::mpi::broadcastObject(textFile); + + std::istringstream fileIss(textFile); + std::string line; + + // first line contains generation domain sizes + std::getline(fileIss, line); + Vector3< real_t > generationDomainSize_SI(0_r); + std::istringstream firstLine(line); + firstLine >> generationDomainSize_SI[0] >> generationDomainSize_SI[1] >> generationDomainSize_SI[2]; + WALBERLA_LOG_DEVEL_VAR_ON_ROOT(generationDomainSize_SI) + + WALBERLA_CHECK_EQUAL(uint_c(scalingFactor * generationDomainSize_SI[0] * real_c(bedCopiesInX)), domainSize[0], + "Error: Generated bed with copies and simulation domain do not match in x!") + WALBERLA_CHECK_EQUAL(uint_c(scalingFactor * generationDomainSize_SI[1] * real_c(bedCopiesInY)), domainSize[1], + "Error: Generated bed with copies and simulation domain do not match in y!") + + numParticles = 0; + maxParticleHeight = 0_r; + + while (std::getline(fileIss, line)) + { + std::istringstream iss(line); + + data::ParticleStorage::uid_type uID; + data::ParticleStorage::position_type pos; + walberla::real_t radius; + iss >> uID >> pos[0] >> pos[1] >> pos[2] >> radius; + radius *= scalingFactor; + + for (uint_t copyInYDir = 0; copyInYDir < bedCopiesInY; ++copyInYDir) + { + for (uint_t copyInXDir = 0; copyInXDir < bedCopiesInX; ++copyInXDir) + { + auto particlePos = pos; + + particlePos[0] += real_c(copyInXDir) * generationDomainSize_SI[0]; + particlePos[1] += real_c(copyInYDir) * generationDomainSize_SI[1]; + + particlePos *= 
scalingFactor; + + maxParticleHeight = std::max(maxParticleHeight, particlePos[2] + radius); + + if (!particleCreateFunction(particlePos)) continue; + + WALBERLA_CHECK(simulationDomain.contains(particlePos), + "Particle read from file is not contained in simulation domain"); + + if (!domain.isContainedInProcessSubdomain(uint_c(rank), particlePos)) continue; + + auto pIt = ps.create(); + pIt->setPosition(particlePos); + pIt->getBaseShapeRef() = std::make_shared< data::Sphere >(radius); + pIt->getBaseShapeRef()->updateMassAndInertia(density); + pIt->setInteractionRadius(radius); + pIt->setOwner(rank); + pIt->setType(0); + + numParticles++; + } + } + + WALBERLA_CHECK_EQUAL(iss.tellg(), -1); + } + walberla::mpi::allReduceInplace(maxParticleHeight, walberla::mpi::MAX); + walberla::mpi::allReduceInplace(numParticles, walberla::mpi::SUM); +} + +void getAverageVelocity(const mesa_pd::data::ParticleAccessorWithBaseShape& ac, real_t& averageVelocity, + real_t& maxVelocity, uint_t& numParticles, real_t& maxHeight) +{ + averageVelocity = real_t(0); + maxVelocity = real_t(0); + numParticles = uint_t(0); + maxHeight = real_t(0); + for (uint_t i = 0; i < ac.size(); ++i) + { + if (isSet(ac.getFlags(i), walberla::mesa_pd::data::particle_flags::GHOST)) continue; + if (isSet(ac.getFlags(i), walberla::mesa_pd::data::particle_flags::GLOBAL)) continue; + + ++numParticles; + real_t velMagnitude = ac.getLinearVelocity(i).length(); + averageVelocity += velMagnitude; + maxVelocity = std::max(maxVelocity, velMagnitude); + maxHeight = std::max(maxHeight, ac.getPosition(i)[2]); + } + + walberla::mpi::allReduceInplace(numParticles, walberla::mpi::SUM); + walberla::mpi::allReduceInplace(averageVelocity, walberla::mpi::SUM); + walberla::mpi::allReduceInplace(maxVelocity, walberla::mpi::MAX); + walberla::mpi::allReduceInplace(maxHeight, walberla::mpi::MAX); + + averageVelocity /= real_t(numParticles); +} + +auto createPlane(mesa_pd::data::ParticleStorage& ps, const mesa_pd::Vec3& pos, const 
mesa_pd::Vec3& normal) +{ + auto p0 = ps.create(true); + p0->setPosition(pos); + p0->setBaseShape(std::make_shared< mesa_pd::data::HalfSpace >(normal)); + p0->getBaseShapeRef()->updateMassAndInertia(real_t(1)); + p0->setOwner(walberla::mpi::MPIManager::instance()->rank()); + p0->setType(0); + p0->setInteractionRadius(std::numeric_limits< real_t >::infinity()); + mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::GLOBAL); + mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::INFINITE); + mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::FIXED); + mesa_pd::data::particle_flags::set(p0->getFlagsRef(), mesa_pd::data::particle_flags::NON_COMMUNICATING); + return p0; +} + +} // namespace antidunes +} // namespace walberla diff --git a/apps/showcases/Antidunes/pyvista.ipynb b/apps/showcases/Antidunes/pyvista.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6cb28649aca82ce6aa152f0b56a3a49552f6c00a --- /dev/null +++ b/apps/showcases/Antidunes/pyvista.ipynb @@ -0,0 +1,827 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0953882f", + "metadata": {}, + "outputs": [], + "source": [ + "import pyvista as pv\n", + "import numpy as np\n", + "\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "id": "81395353", + "metadata": {}, + "source": [ + "## Plot initial setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddf1b644", + "metadata": {}, + "outputs": [], + "source": [ + "fluid_file = \"/simdata/on74yces/2386423/vtk-out/fluid_field/simulation_step_0.vtu\"\n", + "particle_file = \"/simdata/on74yces/2386423/vtk-out/particles/simulation_step_0.vtu\"\n", + "\n", + "# taken from job output file\n", + "dx = 0.00025 \n", + "dt = 1.38e-5\n", + "\n", + "fluid_reader = pv.get_reader(fluid_file)\n", + "fluid_mesh = fluid_reader.read()\n", + "print(\"fluid_mesh:\", 
fluid_mesh.array_names)\n", + "\n", + "particle_reader = pv.get_reader(particle_file)\n", + "particle_mesh = particle_reader.read()\n", + "print(\"particle_mesh:\", particle_mesh.array_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b7bb742", + "metadata": {}, + "outputs": [], + "source": [ + "# clip meshes\n", + "fluid_mesh = fluid_mesh.clip(normal='x', origin=(2700,59,41.5), invert=False)\n", + "particle_mesh = particle_mesh.clip(normal='x', origin=(2700,59,41.5), invert=False)\n", + "\n", + "# remove gas data from fluid_mesh\n", + "fluid_mesh = fluid_mesh.threshold(value=2, scalars=\"mapped_flag\", invert=True)\n", + "\n", + "# convert velocity to SI units\n", + "velocity_unit_conversion = dx / dt\n", + "velocity_si = fluid_mesh.get_array(\"velocity\") * velocity_unit_conversion\n", + "fluid_mesh.add_field_data(velocity_si, \"velocity_si\")\n", + "\n", + "# convert particle radius to diameter in SI units\n", + "diameter_si = particle_mesh.point_data[\"radius\"] * 2 * dx\n", + "particle_mesh.point_data[\"diameter_si\"] = diameter_si\n", + "\n", + "# create glyphs for particles\n", + "sphere_glyphs = particle_mesh.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=150, phi_resolution=150))\n", + "\n", + "# add box for outlining the domain\n", + "domain_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 160))\n", + "\n", + "# add box for outlining fixed particles\n", + "particles_fixed_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 17.3943))\n", + "\n", + "# add box for outlining the bed height\n", + "particles_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 48.0002))\n", + "\n", + "# add box for outlining the liquid height\n", + "liquid_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 82.3888))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8a6aff5", + "metadata": {}, + "outputs": [], + "source": [ + "pl = pv.Plotter(lighting='three lights')\n", + "pl.add_mesh(fluid_mesh, 
scalars=\"velocity_si\", component=0)\n", + "pl.add_mesh(sphere_glyphs, scalars=\"diameter_si\", cmap=\"Greys\")\n", + "pl.add_mesh(domain_box.outline(), line_width=3, color=\"black\")\n", + "pl.add_mesh(particles_fixed_box.outline(), line_width=1, color=\"black\")\n", + "pl.add_mesh(particles_box.outline(), line_width=1, color=\"black\")\n", + "pl.add_mesh(liquid_box.outline(), line_width=1, color=\"black\")\n", + "\n", + "pl.view_isometric()\n", + "pl.enable_parallel_projection()\n", + "#pl.camera.roll += 0\n", + "pl.camera.elevation -= 15\n", + "pl.camera.azimuth -= 90\n", + "pl.remove_scalar_bar(\"velocity_si\")\n", + "pl.remove_scalar_bar(\"diameter_si\")\n", + "pl.set_background('white') \n", + "#pl.show_axes()\n", + "pl.camera.zoom(1.75)\n", + "#pl.screenshot(\"/home/rzlin/ca36xymo/setup.png\", transparent_background=False, window_size=(2560, 1440))\n", + "pl.show()" + ] + }, + { + "cell_type": "markdown", + "id": "5d1248ea", + "metadata": {}, + "source": [ + "## Plot velocity field (side view)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ca340e6", + "metadata": {}, + "outputs": [], + "source": [ + "fluid_file = \"/simdata/on74yces/2519434/vtk-out/fluid_field/simulation_step_4488000.vtu\"\n", + "particle_file = \"/simdata/on74yces/2519434/vtk-out/particles/simulation_step_4488000.vtu\"\n", + "\n", + "# taken from job output file\n", + "dx = 0.00025 \n", + "dt = 1.38e-5\n", + "\n", + "fluid_reader = pv.get_reader(fluid_file)\n", + "fluid_mesh = fluid_reader.read()\n", + "print(\"fluid_mesh:\", fluid_mesh.array_names)\n", + "\n", + "particle_reader = pv.get_reader(particle_file)\n", + "particle_mesh = particle_reader.read()\n", + "print(\"particle_mesh:\", particle_mesh.array_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f41650a", + "metadata": {}, + "outputs": [], + "source": [ + "# clip meshes\n", + "fluid_mesh = fluid_mesh.clip(normal='x', origin=(2200,59,41.5), invert=False)\n", + 
"particle_mesh = particle_mesh.clip(normal='x', origin=(2200,59,41.5), invert=False)\n", + "fluid_mesh = fluid_mesh.clip(normal='x', origin=(3000,59,41.5), invert=True)\n", + "particle_mesh = particle_mesh.clip(normal='x', origin=(3000,59,41.5), invert=True)\n", + "\n", + "# remove gas data from fluid_mesh\n", + "fluid_mesh = fluid_mesh.threshold(value=2, scalars=\"mapped_flag\", invert=True)\n", + "\n", + "# convert velocity to SI units\n", + "velocity_unit_conversion = dx / dt\n", + "velocity_si = fluid_mesh.get_array(\"velocity\") * velocity_unit_conversion\n", + "fluid_mesh.add_field_data(velocity_si, \"velocity_si\")\n", + "\n", + "# convert particle radius to diameter in SI units\n", + "diameter_si = particle_mesh.point_data[\"radius\"] * 2 * dx\n", + "particle_mesh.point_data[\"diameter_si\"] = diameter_si\n", + "\n", + "# create glyphs for particles\n", + "sphere_glyphs = particle_mesh.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=150, phi_resolution=150))\n", + "\n", + "# # add box for outlining the domain\n", + "# domain_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 160))\n", + "\n", + "# # add box for outlining fixed particles\n", + "# particles_fixed_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 17.3943))\n", + "\n", + "# # add box for outlining the bed height\n", + "# particles_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 48.0002))\n", + "\n", + "# # add box for outlining the liquid height\n", + "# liquid_box = pv.Box(bounds=(2700, 3200, 0, 60, 0, 82.3888))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3647ceda", + "metadata": {}, + "outputs": [], + "source": [ + "pl = pv.Plotter(lighting='three lights')\n", + "pl.add_mesh(fluid_mesh, scalars=\"velocity_si\", component=0)\n", + "pl.add_mesh(sphere_glyphs, scalars=\"diameter_si\", cmap=\"Greys\")\n", + "# pl.add_mesh(domain_box.outline(), line_width=3, color=\"black\")\n", + "# pl.add_mesh(particles_fixed_box.outline(), line_width=1, 
color=\"black\")\n", + "# pl.add_mesh(particles_box.outline(), line_width=1, color=\"black\")\n", + "# pl.add_mesh(liquid_box.outline(), line_width=1, color=\"black\")\n", + "\n", + "#pl.view_isometric()\n", + "pl.view_xz()\n", + "pl.enable_parallel_projection()\n", + "#pl.camera.roll += 0\n", + "#pl.camera.elevation -= 15\n", + "#pl.camera.azimuth -= 90\n", + "#pl.remove_scalar_bar(\"velocity_si\")\n", + "#pl.remove_scalar_bar(\"diameter_si\")\n", + "#pl.set_background('white') \n", + "#pl.show_axes()\n", + "pl.camera.zoom(3.3)\n", + "#pl.screenshot(\"/simdata/ca36xymo/velocity-field.png\", transparent_background=True, window_size=(2560, 720))\n", + "pl.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8c1f1df5", + "metadata": {}, + "source": [ + "## Plot velocity field (3D view) for graphical abstract" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "070962c0", + "metadata": {}, + "outputs": [], + "source": [ + "fluid_file = \"/simdata/on74yces/2593014/vtk-out/fluid_field/simulation_step_7550001.vtu\"\n", + "particle_file = \"/simdata/on74yces/2593014/vtk-out/particles/simulation_step_7550001.vtu\"\n", + "surface_file = \"/simdata/on74yces/2593014/mesh-out/simulation_step_0.obj\"\n", + "\n", + "# taken from job output file\n", + "dx = 0.00025 \n", + "dt = 1.38e-5\n", + "\n", + "fluid_reader = pv.get_reader(fluid_file)\n", + "fluid_mesh = fluid_reader.read()\n", + "print(\"fluid_mesh:\", fluid_mesh.array_names)\n", + "\n", + "particle_reader = pv.get_reader(particle_file)\n", + "particle_mesh = particle_reader.read()\n", + "print(\"particle_mesh:\", particle_mesh.array_names)\n", + "\n", + "surface_reader = pv.get_reader(surface_file)\n", + "surface_mesh = surface_reader.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23597fc3", + "metadata": {}, + "outputs": [], + "source": [ + "# remove gas data from fluid_mesh\n", + "fluid_mesh = fluid_mesh.threshold(value=2, scalars=\"mapped_flag\", invert=True)\n", 
+ "\n", + "# remove 5 cells in y-direction so that we can see the particles more clearly \n", + "fluid_mesh = fluid_mesh.clip(normal='y', origin=(2200,5,41.5), invert=False)\n", + "\n", + "# remove front part of the so that the figure starts from a low point (looks better)\n", + "# 2958 in particle field so that particle holes are not visible in the fluid mesh\n", + "fluid_mesh_back = fluid_mesh.clip(normal='x', origin=(2950,5,41.5), invert=True)\n", + "particle_mesh_back = particle_mesh.clip(normal='x', origin=(2958,59,41.5), invert=True) \n", + "\n", + "# store front part for appending it at the back (to maintain periodicity) \n", + "fluid_mesh_front = fluid_mesh.clip(normal='x', origin=(2950,5,41.5), invert=False)\n", + "particle_mesh_front = particle_mesh.clip(normal='x', origin=(2950,59,41.5), invert=False) # 2958 not necessary here\n", + "\n", + "# move front part to the back\n", + "transform_matrix = np.array([[1, 0, 0, -3200],\n", + " [0, 1, 0, 0],\n", + " [0, 0, 1, 0],\n", + " [0, 0, 0, 1]])\n", + "fluid_mesh_front = fluid_mesh_front.transform(transform_matrix)\n", + "particle_mesh_front = particle_mesh_front.transform(transform_matrix)\n", + "\n", + "# create glyphs for particles\n", + "sphere_glyphs_back = particle_mesh_back.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=150, phi_resolution=150))\n", + "sphere_glyphs_front = particle_mesh_front.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=150, phi_resolution=150))\n", + "\n", + "# convert velocity to SI units\n", + "velocity_unit_conversion = dx / dt\n", + "# IMPORTANT: \n", + "# The new velocity field (velocity_si) is not clipped if it is part of fluid_mesh.\n", + "# Therefore, we have to do the unit conversion for every clipped mesh individually.\n", + "fluid_mesh_back.add_field_data(fluid_mesh_back.get_array(\"velocity\") * velocity_unit_conversion, \"velocity_si\")\n", + 
"fluid_mesh_front.add_field_data(fluid_mesh_front.get_array(\"velocity\") * velocity_unit_conversion, \"velocity_si\")\n", + "\n", + "# get data range of the x-velocity from both fields for using it in the color bar\n", + "velocity_field_back_x_si = (fluid_mesh_back.field_data[\"velocity_si\"])[:,0]\n", + "velocity_field_front_x_si = (fluid_mesh_front.field_data[\"velocity_si\"])[:,0]\n", + "range_velocity_x_si = [min(np.min(velocity_field_back_x_si), np.min(velocity_field_front_x_si)),\n", + " max(np.max(velocity_field_back_x_si), np.max(velocity_field_front_x_si))]\n", + "\n", + "# convert particle radius to diameter in SI units\n", + "sphere_glyphs_back.point_data[\"diameter_si\"] = sphere_glyphs_back.point_data[\"radius\"] * 2 * dx\n", + "sphere_glyphs_front.point_data[\"diameter_si\"] = sphere_glyphs_front.point_data[\"radius\"] * 2 * dx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0a2b037", + "metadata": {}, + "outputs": [], + "source": [ + "pl = pv.Plotter(lighting='three lights')\n", + "pl.add_mesh(fluid_mesh_back, scalars=\"velocity_si\", component=0, show_scalar_bar=False)\n", + "pl.add_mesh(fluid_mesh_front, scalars=\"velocity_si\", component=0, show_scalar_bar=False)\n", + "pl.add_scalar_bar(title=\"Downstream fluid velocity / m*s\", vertical=False, width=0.15, height=0.05,\n", + " position_x=0.7, position_y=0.6, color='black', title_font_size=40, label_font_size=25)\n", + "pl.update_scalar_bar_range(clim=[range_velocity_x_si[0],range_velocity_x_si[1]])\n", + "\n", + "pl.add_mesh(sphere_glyphs_back, scalars=\"diameter_si\", cmap=\"Greys\", show_scalar_bar=False)\n", + "pl.add_mesh(sphere_glyphs_front, scalars=\"diameter_si\", cmap=\"Greys\", show_scalar_bar=False)\n", + "pl.add_scalar_bar(title=\"Particle diameter / m\", vertical=False, width=0.15, height=0.05,\n", + " position_x=0.3, position_y=0.2, color='black', title_font_size=40, label_font_size=25, \n", + " n_labels=3, fmt=\"%.4f\")\n", + 
"pl.update_scalar_bar_range(clim=[0.0026,0.0032])\n", + "\n", + "pl.view_isometric()\n", + "#pl.enable_parallel_projection()\n", + "pl.camera.roll -= -5\n", + "pl.camera.elevation -= 35\n", + "pl.camera.azimuth -= 50\n", + "pl.set_background('white') \n", + "#pl.show_axes()\n", + "pl.camera.zoom(7)\n", + "#pl.screenshot(\"/home/rzlin/ca36xymo/velocity-3d.png\", transparent_background=False, window_size=(2560, 1440))\n", + "pl.show()" + ] + }, + { + "cell_type": "markdown", + "id": "adc126de", + "metadata": {}, + "source": [ + "## Plot full velocity field at each sampling point (for creating the animations)\n", + "\n", + "### Warning: there are severe bugs in pyvista that overcomplicate this script:\n", + "1. memory leak in pl.screenshot(); memory leak is supposed to be circumvented with pl.clear();\n", + "2. HOWEVER: when pl.clear() was called, pl.view_xz() somehow disrupts pl.camera.zoom() such that the latter command does not work anymore\n", + "\n", + "=> new strategy: call python from bash such that the kernel restarts for every file; this avoids the memory leak" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4b4d571", + "metadata": {}, + "outputs": [], + "source": [ + "def get_image_array(pl, fluid_file_path, particle_file_path, dx, dt, color_bar_limits_velocity):\n", + " fluid_reader = pv.get_reader(fluid_file_path)\n", + " fluid_mesh = fluid_reader.read()\n", + " \n", + " # remove gas data from fluid_mesh\n", + " fluid_mesh = fluid_mesh.threshold(value=2, scalars=\"mapped_flag\", invert=True)\n", + "\n", + " particle_reader = pv.get_reader(particle_file_path)\n", + " particle_mesh = particle_reader.read()\n", + " \n", + " # convert velocity to SI units\n", + " velocity_unit_conversion = dx / dt\n", + " velocity_si = fluid_mesh.get_array(\"velocity\") * velocity_unit_conversion\n", + " fluid_mesh.add_field_data(velocity_si, \"velocity_si\")\n", + "\n", + " # convert particle radius to diameter in SI units\n", + " diameter_si = 
particle_mesh.point_data[\"radius\"] * 2 * dx\n", + " particle_mesh.point_data[\"diameter_si\"] = diameter_si\n", + "\n", + " # create glyphs for particles\n", + " sphere_glyphs = particle_mesh.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=150, phi_resolution=150))\n", + "\n", + " \n", + " pl.add_mesh(fluid_mesh, scalars=\"velocity_si\", component=0, show_scalar_bar=False)\n", + " pl.add_scalar_bar(title=\"Downstream fluid velocity / m*s\", vertical=False, width=0.15, height=0.15,\n", + " position_x=0.325, position_y=0.8, color='black', title_font_size=70, label_font_size=50)\n", + " pl.update_scalar_bar_range(clim=color_bar_limits_velocity)\n", + " pl.add_mesh(sphere_glyphs, scalars=\"diameter_si\", cmap=\"Greys\", show_scalar_bar=False)\n", + " pl.add_scalar_bar(title=\"Particle diameter / m\", vertical=False, width=0.15, height=0.15,\n", + " position_x=0.525, position_y=0.8, color='black', title_font_size=70, label_font_size=50)\n", + " pl.update_scalar_bar_range(clim=[0.0026,0.0032])\n", + " \n", + " fluid_mesh.clear_field_data()\n", + " particle_mesh.clear_field_data()\n", + " sphere_glyphs.clear_field_data()\n", + "\n", + " pl.view_xz()\n", + " pl.enable_parallel_projection()\n", + " #pl.remove_scalar_bar(\"velocity\")\n", + " #pl.remove_scalar_bar(\"radius\")\n", + " pl.set_background('white')\n", + " pl.camera.zoom(10)\n", + " image_array = pl.screenshot(transparent_background=False, window_size=(10000, 1000))\n", + " \n", + " # returns image as numpy array\n", + " return image_array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17d29056", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from PIL import Image\n", + "import gc\n", + "\n", + "fluid_file_basepath = \"/simdata/on74yces/2386423/vtk-out/fluid_field\"\n", + "particle_file_basepath = \"/simdata/on74yces/2386423/vtk-out/particles\"\n", + "output_file_basepath = \"/simdata/ca36xymo/antidunes/test\"\n", + "\n", + "# 
taken from job output file\n", + "dx = 0.00025 \n", + "dt = 1.38e-5 # E1\n", + "#dt = 1.08125e-5 # E4\n", + "\n", + "# empirically set after viewing data in ParaView (in lattice units)\n", + "color_bar_limits_velocity = np.array([-0.3, 1]) # E1 with e_dry=0.97\n", + "color_bar_limits_velocity = np.array([-0.3, 1.4]) # E4 with e_dry=0.97\n", + "\n", + "fluid_file_directory = os.fsencode(fluid_file_basepath)\n", + "\n", + "# create plotter outside of loop to avoid memoryleak in pyvista.screenshot()\n", + "# (https://github.com/pyvista/pyvista/issues/2252#issuecomment-1241929793)\n", + "pl = pv.Plotter(lighting='three lights')\n", + "\n", + "for file in os.listdir(fluid_file_directory):\n", + " filename = os.fsdecode(file)\n", + " \n", + " fluid_file_path = fluid_file_basepath + '/' + filename\n", + " particle_file_path = particle_file_basepath + '/' + filename\n", + " \n", + " filenumber = filename.split('_')\n", + " filenumber = filenumber[2].split('.')\n", + " filenumber = filenumber[0]\n", + " \n", + " output_file_path = output_file_basepath + '/' + filenumber + '.jpg'\n", + " \n", + " image_array = get_image_array(pl, fluid_file_path, particle_file_path, dx, dt, color_bar_limits_velocity)\n", + "\n", + " pl.clear_actors()\n", + "\n", + " image = Image.fromarray(image_array)\n", + " \n", + " image = image.crop((0, 0, image.size[0], 700)) # coordinate systems starts from top left\n", + " image.save(output_file_path)\n", + " image.close()" + ] + }, + { + "cell_type": "markdown", + "id": "b9a63e3b", + "metadata": {}, + "source": [ + "### Bash script solution to circumvent memory leak bug" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5b66e66", + "metadata": {}, + "outputs": [], + "source": [ + "# this Python script (named create_image.py) must be called by the bash script\n", + "\n", + "#!/usr/bin/env python3\n", + "\n", + "import os\n", + "import sys\n", + "from PIL import Image\n", + "import pyvista as pv\n", + "import numpy as np\n", + 
"\n", + "pv.start_xvfb()\n", + "\n", + "# taken from job output file\n", + "dx = 0.00025\n", + "dt = 1.38e-5 # E1\n", + "#dt = 1.08125e-5 # E4\n", + "\n", + "# empirically set after viewing data in ParaView (in lattice units)\n", + "color_bar_limits_velocity = np.array([-0.3, 1]) # E1 with e_dry=0.97\n", + "#color_bar_limits_velocity = np.array([-0.3, 1.4]) # E4 with e_dry=0.97\n", + "\n", + "# path to fluid file must be given as first command line argument\n", + "fluid_file_path = str(sys.argv[1])\n", + "particle_file_path = fluid_file_path.replace(\"fluid_field\", \"particles\")\n", + "\n", + "filenumber = fluid_file_path.split('_')[-1]\n", + "filenumber = filenumber.split('.')[0]\n", + "filenumber = str(filenumber).rjust(10, '0')\n", + "output_file_path = \"/simdata/ca36xymo/antidunes/animation-e1/\" + filenumber + \".jpg\"\n", + "\n", + "fluid_reader = pv.get_reader(fluid_file_path)\n", + "fluid_mesh = fluid_reader.read()\n", + "\n", + "particle_reader = pv.get_reader(particle_file_path)\n", + "particle_mesh = particle_reader.read()\n", + "\n", + "# remove gas data from fluid_mesh\n", + "fluid_mesh = fluid_mesh.threshold(value=2, scalars=\"mapped_flag\", invert=True)\n", + "\n", + "# convert velocity to SI units\n", + "velocity_unit_conversion = dx / dt\n", + "velocity_si = fluid_mesh.get_array(\"velocity\") * velocity_unit_conversion\n", + "fluid_mesh.add_field_data(velocity_si, \"velocity_si\")\n", + "\n", + "# convert particle radius to diameter in SI units\n", + "diameter_si = particle_mesh.point_data[\"radius\"] * 2 * dx\n", + "particle_mesh.point_data[\"diameter_si\"] = diameter_si\n", + "\n", + "# create glyphs for particles\n", + "sphere_glyphs = particle_mesh.glyph(scale=\"radius\", factor=1,\n", + " geom=pv.Sphere(radius=1, theta_resolution=150, phi_resolution=150))\n", + "\n", + "pl = pv.Plotter(lighting='three lights', off_screen=True)\n", + "\n", + "pl.add_mesh(fluid_mesh, scalars=\"velocity_si\", component=0, show_scalar_bar=False)\n", + 
"pl.add_scalar_bar(title=\"Downstream fluid velocity / m*s\", vertical=False, width=0.15, height=0.15,\n", + " position_x=0.325, position_y=0.8, color='black', title_font_size=70, label_font_size=50)\n", + "pl.update_scalar_bar_range(clim=color_bar_limits_velocity)\n", + "pl.add_mesh(sphere_glyphs, scalars=\"diameter_si\", cmap=\"Greys\", show_scalar_bar=False)\n", + "pl.add_scalar_bar(title=\"Particle diameter / m\", vertical=False, width=0.15, height=0.15,\n", + " position_x=0.525, position_y=0.8, color='black', title_font_size=70, label_font_size=50)\n", + "pl.update_scalar_bar_range(clim=[0.0026,0.0032])\n", + "\n", + "pl.view_xz()\n", + "#pl.enable_parallel_projection()\n", + "pl.set_background('white')\n", + "#pl.camera.zoom(10)\n", + "\n", + "# specify camera so that image stays at a fixed position\n", + "pl.camera.clipping_range = (5812.79, 6720.05)\n", + "pl.camera.distance = 6207.82\n", + "pl.camera.focal_point = (1600.44, 29.9889, 38.3558)\n", + "pl.camera.parallel_projection = True\n", + "pl.camera.parallel_scale = 160.67\n", + "pl.camera.thickness = 907.259\n", + "pl.camera.view_angle = 30\n", + "\n", + "pl.window_size = [10000, 1000]\n", + "pl.screenshot(output_file_path,transparent_background=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "537dc005", + "metadata": {}, + "outputs": [], + "source": [ + "#!/bin/bash -l\n", + "\n", + "conda activate\n", + "\n", + "fluid_file_basepath1=\"/simdata/on74yces/2502347/vtk-out/fluid_field/\"\n", + "# fluid_file_basepath2=\"/simdata/on74yces/2504475/vtk-out/fluid_field/\"\n", + "# fluid_file_basepath3=\"/simdata/on74yces/2509267/vtk-out/fluid_field/\"\n", + "# fluid_file_basepath4=\"/simdata/on74yces/2519434/vtk-out/fluid_field/\"\n", + "\n", + "declare -a PathList=($fluid_file_basepath1 $fluid_file_basepath2 $fluid_file_basepath3 $fluid_file_basepath4)\n", + "\n", + "for path in ${PathList[@]}; do\n", + " for file in $path*.vtu; do\n", + " python3 create-image.py $file\n", + " # 
echo $file\n", + " done\n", + "done" + ] + }, + { + "cell_type": "markdown", + "id": "ebe75f76", + "metadata": {}, + "source": [ + "## Plot full bed height elevation at each sampling point (for creating the animations)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9254f001", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# this Python script (named create_image.py) must be called by the bash script\n", + "\n", + "#!/usr/bin/env python3\n", + "\n", + "import os\n", + "import sys\n", + "from PIL import Image\n", + "import pyvista as pv\n", + "import numpy as np\n", + "\n", + "pv.start_xvfb()\n", + "\n", + "# taken from job output file\n", + "dx = 0.00025\n", + "\n", + "# path to particle file must be given as first command line argument\n", + "particle_file_path = \"/simdata/on74yces/2509268/vtk-out/particles/simulation_step_3799000.vtu\" #str(sys.argv[1])\n", + "\n", + "filenumber = particle_file_path.split('_')[-1]\n", + "filenumber = filenumber.split('.')[0]\n", + "filenumber = str(filenumber).rjust(10, '0')\n", + "output_file_path = \"/simdata/ca36xymo/antidunes/animation-particles-e4/\" + filenumber + \".jpg\"\n", + "\n", + "particle_reader = pv.get_reader(particle_file_path)\n", + "particle_mesh = particle_reader.read()\n", + "\n", + "# add z-coordinate of particle center in SI units\n", + "particle_mesh.point_data[\"center_coordinate_z_si\"] = particle_mesh.points[:,2] * dx\n", + "\n", + "# create glyphs for particles\n", + "sphere_glyphs = particle_mesh.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=30, phi_resolution=30))\n", + "\n", + "pl = pv.Plotter(lighting='three lights', off_screen=True)\n", + "\n", + "# add mesh at periodic sides to enlarge bed region\n", + "transform_matrix = np.array([[1, 0, 0, 0],\n", + " [0, 1, 0, -60],\n", + " [0, 0, 1, 0],\n", + " [0, 0, 0, 1]])\n", + "particle_mesh_transformed_once = 
particle_mesh.transform(transform_matrix,inplace=False)\n", + "sphere_glyphs_transformed_once = particle_mesh_transformed_once.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=30, phi_resolution=30))\n", + "particle_mesh_transformed_twice = particle_mesh_transformed_once.transform(transform_matrix,inplace=False)\n", + "sphere_glyphs_transformed_twice = particle_mesh_transformed_twice.glyph(scale=\"radius\", factor=1, \n", + " geom=pv.Sphere(radius=1, theta_resolution=30, phi_resolution=30))\n", + "\n", + "pl.add_mesh(sphere_glyphs_transformed_once, scalars=\"center_coordinate_z_si\", show_scalar_bar=False)\n", + "pl.update_scalar_bar_range(clim=[0.008,0.012])\n", + "pl.add_mesh(sphere_glyphs, scalars=\"center_coordinate_z_si\", show_scalar_bar=False)\n", + "pl.update_scalar_bar_range(clim=[0.008,0.012])\n", + "pl.add_mesh(sphere_glyphs_transformed_twice, scalars=\"center_coordinate_z_si\", show_scalar_bar=False)\n", + "pl.add_scalar_bar(title=\"Bed height elevation / m\", vertical=False, width=0.15, height=0.15,\n", + " position_x=0.425, position_y=0.7, color='black', title_font_size=70, label_font_size=50,\n", + " fmt=\"%.3f\")\n", + "pl.update_scalar_bar_range(clim=[0.008,0.012])\n", + "\n", + "# pl.add_mesh(pv.Line((0,0,100), (3200,0,100)), color=\"lightgray\", line_width=10)\n", + "# pl.add_mesh(pv.Line((0,-60,100), (3200,-60,100)), color=\"lightgray\", line_width=10)\n", + "\n", + "pl.view_xy()\n", + "#pl.enable_parallel_projection()\n", + "pl.set_background('white')\n", + "#pl.camera.zoom(10)\n", + "\n", + "# specify camera so that image stays at a fixed position\n", + "pl.camera.clipping_range = (5812.79, 6720.05)\n", + "pl.camera.distance = 6207.82\n", + "pl.camera.focal_point = (1600.44, 29.9889, 38.3558)\n", + "pl.camera.parallel_projection = True\n", + "pl.camera.parallel_scale = 160.67\n", + "pl.camera.thickness = 907.259\n", + "pl.camera.view_angle = 30\n", + "\n", + "pl.window_size = [10000, 1000]\n", + 
"pl.screenshot(output_file_path,transparent_background=False)" + ] + }, + { + "cell_type": "markdown", + "id": "73d07351", + "metadata": {}, + "source": [ + "## Plot full free-surface height elevation at each sampling point (for creating the animations)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95013d8f", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# this Python script (named create_image.py) must be called by the bash script\n", + "\n", + "#!/usr/bin/env python3\n", + "\n", + "import os\n", + "import sys\n", + "from PIL import Image\n", + "import pyvista as pv\n", + "import numpy as np\n", + "\n", + "pv.start_xvfb()\n", + "\n", + "# taken from job output file\n", + "dx = 0.00025\n", + "\n", + "# path to fluid file must be given as first command line argument\n", + "fluid_file_path = str(sys.argv[1]) #\"/simdata/on74yces/2517354/vtk-out/fluid_field/simulation_step_7551000.vtu\" \n", + "\n", + "filenumber = fluid_file_path.split('_')[-1]\n", + "filenumber = filenumber.split('.')[0]\n", + "filenumber = str(filenumber).rjust(10, '0')\n", + "output_file_path = \"/simdata/ca36xymo/antidunes/animation-free-surface-e4/\" + filenumber + \".jpg\"\n", + "\n", + "fluid_reader = pv.get_reader(fluid_file_path)\n", + "fluid_mesh = fluid_reader.read()\n", + "\n", + "# remove gas data from fluid_mesh\n", + "fluid_mesh = fluid_mesh.threshold(value=2, scalars=\"mapped_flag\", invert=True)\n", + "\n", + "# slice fluid mesh to get real 2D data\n", + "fluid_mesh = fluid_mesh.slice(normal=[0, 1, 0])\n", + "\n", + "# extract surface so we apply the extrusion\n", + "fluid_mesh = fluid_mesh.extract_surface()\n", + "\n", + "# extrude fluid_mesh to 3D\n", + "fluid_mesh = fluid_mesh.extrude([0, -60, 0], capping=False)\n", + "\n", + "# add z-coordinate of cell center in SI units\n", + "fluid_mesh.point_data[\"coordinate_z_si\"] = fluid_mesh.points[:,2] * dx\n", + "\n", + "pl = pv.Plotter(lighting='three lights', off_screen=True)\n", 
+ "\n", + "pl.add_mesh(fluid_mesh, scalars=\"coordinate_z_si\", component=0, show_scalar_bar=False)\n", + "pl.add_scalar_bar(title=\"Free-surface elevation / m\", vertical=False, width=0.15, height=0.15,\n", + " position_x=0.425, position_y=0.7, color='black', title_font_size=70, label_font_size=50,\n", + " fmt=\"%.3f\")\n", + "#pl.update_scalar_bar_range(clim=[0.018,0.022]) # E1\n", + "pl.update_scalar_bar_range(clim=[0.020,0.024]) # E4\n", + "\n", + "pl.view_xy()\n", + "#pl.enable_parallel_projection()\n", + "pl.set_background('white')\n", + "#pl.camera.zoom(10)\n", + "\n", + "# specify camera so that image stays at a fixed position\n", + "pl.camera.clipping_range = (5812.79, 6720.05)\n", + "pl.camera.distance = 6207.82\n", + "pl.camera.focal_point = (1600.44, 29.9889, 38.3558)\n", + "pl.camera.parallel_projection = True\n", + "pl.camera.parallel_scale = 160.67\n", + "pl.camera.thickness = 907.259\n", + "pl.camera.view_angle = 30\n", + "\n", + "pl.window_size = [10000, 1000]\n", + "\n", + "pl.screenshot(output_file_path,transparent_background=False)" + ] + }, + { + "cell_type": "markdown", + "id": "b516bfd5", + "metadata": {}, + "source": [ + "## Plot coordinate systems\n", + "This is very troublesome to do in the actual plots => add them afterwards using ffmpeg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e46cfd29", + "metadata": {}, + "outputs": [], + "source": [ + "import pyvista as pv\n", + "\n", + "output_file_path = \"/simdata/ca36xymo/antidunes/animation-free-surface-e1/coordinate-system.jpg\"\n", + "\n", + "pl = pv.Plotter(lighting='three lights', off_screen=True)\n", + "pl.view_xz()\n", + "pl.add_axes(color=\"black\", x_color=\"black\", y_color=\"black\", z_color=\"black\", viewport=(0, 0, 1, 1))\n", + "pl.set_background('white')\n", + "\n", + "pl.screenshot(output_file_path,transparent_background=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63082336", + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/apps/showcases/Antidunes/slice_evaluation.ipynb b/apps/showcases/Antidunes/slice_evaluation.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..422762f92b8eceb90364660bb43a1c07f777a96c --- /dev/null +++ b/apps/showcases/Antidunes/slice_evaluation.ipynb @@ -0,0 +1,1138 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "4f094327", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib notebook\n", + "import matplotlib as mpl\n", + "import numpy as np\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "904ccd9a", + "metadata": {}, + "outputs": [], + "source": [ + "baseFolder = \"/simdata/on74yces/merge_E1_flat_new\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7782605", + "metadata": {}, + "outputs": [], + "source": [ + "def readSliceData(fileName):\n", + " with open(fileName) as f:\n", + " line = f.readline()[2:].split(\" \")\n", + " nx = int(line[0])\n", + " nz = int(line[1])\n", + " svf = np.loadtxt(fileName)\n", + " return svf.reshape((nz,nx)).transpose() # x: dim0, z: dim1\n", + "\n", + "def extractMaxAvailableTimeStep():\n", + " import glob\n", + "\n", + " baseFileName = \"/svfSlice_\"\n", + " fileEnding = \".txt\"\n", + " allFiles = glob.glob(baseFolder + baseFileName+\"*\"+fileEnding)\n", + " maxTimeStep = np.max([int(f[(f.find(baseFileName)+len(baseFileName)):f.find(fileEnding)]) for f in allFiles])\n", + " return maxTimeStep\n", 
+ "\n", + "def getCurrentForcing(t):\n", + " fileName = baseFolder+\"/fluidInfo.txt\"\n", + " # columns: t, fx, uMean, ...\n", + " content = np.loadtxt(fileName).transpose()\n", + " idx = np.argwhere(content[0] == t)\n", + " return content[1][idx]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "114b2285", + "metadata": {}, + "outputs": [], + "source": [ + "# source: \n", + "# https://stackoverflow.com/questions/26563858/matplotlib-imshow-fixed-aspect-and-vertical-colorbar-matching-master-axis-height\n", + "def match_colorbar(cb, ax=None):\n", + " \"\"\"\n", + " Match the size of the colorbar with the size of the axes.\n", + " \n", + " Args:\n", + " ax: Axes from which the colorbar \"stole\" space.\n", + " cb: Colorbar to match to `ax`.\n", + " \"\"\"\n", + " ax = ax or plt.gca()\n", + " bbox = ax.get_position()\n", + " cb_bbox = cb.ax.get_position()\n", + " if cb.orientation == \"vertical\":\n", + " # Update bottom and height.\n", + " left = cb_bbox.xmin\n", + " width = cb_bbox.width\n", + " bottom = bbox.ymin\n", + " height = bbox.height\n", + " else:\n", + " # Update left and width.\n", + " left = bbox.xmin\n", + " width = bbox.width\n", + " bottom = cb_bbox.ymin\n", + " height = cb_bbox.height\n", + " pos = [left, bottom, width, height]\n", + " cb.ax.set_position(pos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d487479", + "metadata": {}, + "outputs": [], + "source": [ + "def evalTimeStep(t):\n", + " \n", + " bedHeightThreshold = 0.3\n", + " \n", + " timestep = str(t)\n", + " fileName = baseFolder+\"/svfSlice_\"+timestep+\".txt\"\n", + " svf = readSliceData(fileName)\n", + " fileName = baseFolder+\"/fillSlice_\"+timestep+\".txt\"\n", + " fill = readSliceData(fileName)\n", + " fileName = baseFolder+\"/velXSlice_\"+timestep+\".txt\"\n", + " vel = readSliceData(fileName)\n", + " \n", + " fx = getCurrentForcing(t)\n", + " \n", + " #print(\"Total water volume: \" , np.sum(fill))\n", + " \n", + " 
#plt.figure()\n", + " #plt.imshow(svf<bedHeightThreshold, interpolation='none')\n", + " #plt.imshow(vel, interpolation='none')\n", + " #plt.colorbar()\n", + " #plt.show()\n", + " \n", + " xlen = svf.shape[0]\n", + " zlen = svf.shape[1]\n", + "\n", + "\n", + " xPos = np.linspace(0.5, xlen - 0.5, num = xlen)\n", + " zPos = np.linspace(0.5, zlen - 0.5, num = zlen)\n", + "\n", + " waterHeightOverX = np.zeros(xlen)\n", + " bedHeightOverX = np.zeros(xlen)\n", + " froudeOverX = np.zeros(xlen)\n", + " shieldsOverX = np.zeros(xlen)\n", + " ReFricOverX = np.zeros(xlen)\n", + " \n", + " #plt.figure()\n", + " \n", + " avgUMean = 0\n", + " avgWaterHeight = 0\n", + "\n", + " for x in range(xlen):\n", + " svfX = svf[x,:]\n", + " fillX = fill[x,:]\n", + " velX = vel[x,:]\n", + "\n", + " waterElevation = np.sum(fillX)\n", + " idxWaterElevation = int(waterElevation)\n", + " \n", + " idxBedHeight = zlen - np.argmax(svfX[::-1]>bedHeightThreshold) - 1\n", + " bedHeight = zPos[idxBedHeight] + (bedHeightThreshold - svfX[idxBedHeight]) / (svfX[idxBedHeight+1] - svfX[idxBedHeight]) * 1\n", + "\n", + " waterHeight = waterElevation - bedHeight\n", + " \n", + " #print(idxWaterElevation, idxBedHeight, svfX[idxBedHeight+1], svfX[idxBedHeight], svfX[idxBedHeight-1])\n", + "\n", + " uMean = np.average(velX[idxBedHeight:idxWaterElevation+1])\n", + " \n", + " #velDerivative = np.gradient(velX)\n", + " # velocity derivative is very sensitive to the exact location -> where to take it?\n", + " # here: take average over some cells right above the bed\n", + " #velDerivativeAtBed = np.average(velDerivative[idxBedHeight+1:idxBedHeight+5]) # TODO check this!!!!\n", + " #wallShearStress = dynViscosity * velDerivativeAtBed \n", + " # note: this does only get a small part of the total stress on the particle -> different approach necessary\n", + " \n", + " # integrate current forcing over water height to the position of the bed (see discussion with Bernhard)\n", + " wallShearStress = waterHeight * fx\n", + " 
\n", + " frictionVelocity = np.sqrt(wallShearStress / fluidDensity)\n", + "\n", + " waterHeightOverX[x] = waterHeight\n", + " bedHeightOverX[x] = bedHeight\n", + " froudeOverX[x] = uMean / np.sqrt(waterHeight * gravAcceleration)\n", + " shieldsOverX[x] = wallShearStress / ((densityRatio - 1) * gravAcceleration * avgDiameter)\n", + " ReFricOverX[x] = frictionVelocity * waterHeight / kinematicViscosity\n", + " \n", + " \n", + " \n", + " #plt.plot(velDerivative[idxBedHeight:idxWaterElevation-1])\n", + " #plt.plot(velX[idxBedHeight:idxWaterElevation+1])\n", + " \n", + " avgWaterHeight += waterHeight\n", + " avgUMean += uMean\n", + " \n", + " #plt.xlabel(\"height\")\n", + " #plt.ylabel(\"velX\")\n", + " #plt.show()\n", + " \n", + " avgWaterHeight /= float(xlen)\n", + " avgUMean /= float(xlen)\n", + " \n", + " ReBulk = avgUMean * avgWaterHeight / kinematicViscosity\n", + " \n", + " print(\"Re_b = \", ReBulk)\n", + " \n", + " return (xPos,waterHeightOverX, bedHeightOverX, froudeOverX, shieldsOverX, ReFricOverX)\n", + "\n", + "\n", + "def plotResult(result):\n", + " fig, axes = plt.subplots(nrows=3, ncols=2,sharex=True)\n", + " \n", + " labels = [\"$h_f$\", \"$h_b$\", \"$Fr$\", \"$Sh$\", \"$Re_\\\\tau$\"]\n", + " for i, ax in enumerate(axes.flatten()):\n", + " if i >= len(labels):\n", + " break\n", + " ax.plot(result[0], result[i+1],'-')\n", + " ax.set_ylabel(labels[i])\n", + " ax.set_xlabel(\"$x$\")\n", + " \n", + " fig.tight_layout()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeb92b70", + "metadata": {}, + "outputs": [], + "source": [ + "with open(baseFolder+'/info.txt') as f:\n", + " lines = f.readlines()\n", + " evalFrequency = int(lines[0])\n", + " gravAcceleration = float(lines[1])\n", + " dynViscosity = float(lines[2])\n", + " densityRatio = float(lines[3])\n", + " avgDiameter = float(lines[4])\n", + " xSize = int(lines[5])\n", + " ySize = int(lines[6])\n", + " zSize = int(lines[7])\n", + " numParticles = 
int(lines[8])\n", + "\n", + "print(\"Eval infos:\")\n", + "print(evalFrequency, gravAcceleration, dynViscosity, densityRatio, avgDiameter,numParticles)\n", + "maxTimeStep = extractMaxAvailableTimeStep()\n", + "\n", + "averageVelocity = 0.02\n", + "fluidDensity = 1\n", + "kinematicViscosity = dynViscosity / fluidDensity\n", + "#timeStepSpacing = 50*evalFrequency\n", + "timeStepSpacing = 10*evalFrequency\n", + "evalTimeSteps = np.arange(0,maxTimeStep,timeStepSpacing)\n", + "\n", + "lengthRef = avgDiameter\n", + "timeRef = avgDiameter / averageVelocity\n", + "\n", + "print(\"Will evaluate\", len(evalTimeSteps), \"slices, from 0 to\", maxTimeStep, \"with spacing\", timeStepSpacing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bdec6b4", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "hbOverTime = np.zeros((len(evalTimeSteps), xSize))\n", + "hfOverTime = np.zeros((len(evalTimeSteps), xSize))\n", + "for i,t in enumerate(evalTimeSteps):\n", + " result = evalTimeStep(t)\n", + " \n", + " # uncomment if plot per time step required\n", + " #plotResult(result)\n", + " \n", + " hfAvg = np.average(result[1])\n", + " hbAvg = np.average(result[2])\n", + " FrAvg = np.average(result[3])\n", + " ShAvg = np.average(result[4])\n", + " print(f\"{i}, t = {t}: hf = {hfAvg:.2f}, hb = {hbAvg:.2f}, Fr = {FrAvg:.3f}, Sh = {ShAvg:.3e}\")\n", + " hfOverTime[i] = result[1]\n", + " hbOverTime[i] = result[2]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3587551b", + "metadata": {}, + "outputs": [], + "source": [ + "# plot 2D x-t-Data evaluation in lattice units\n", + "\n", + "xPlotLimits = np.array([0,xSize]) / lengthRef\n", + "tPlotLimits = np.array([min(evalTimeSteps), max(evalTimeSteps)]) / timeRef\n", + "\n", + "fluidSurfaceOverTime = hbOverTime+hfOverTime\n", + "plt.figure()\n", + "plt.imshow(hbOverTime, interpolation='none', origin='lower',\n", + " aspect='auto',extent=(*xPlotLimits,*tPlotLimits))\n", 
+ "#plt.imshow(fluidSurfaceOverTime[:,:], interpolation='none', origin='lower',aspect='auto')\n", + "plt.colorbar()\n", + "plt.xlabel(r\"$x / D$\")\n", + "plt.ylabel(r\"$t / t_{ref}$\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccfa2b70", + "metadata": {}, + "outputs": [], + "source": [ + "# values taken from job output file\n", + "dx = 0.00025\n", + "\n", + "# E1\n", + "dt = 1.38442e-05\n", + "\n", + "# E4\n", + "#dt = 1.08125e-5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b25e53", + "metadata": {}, + "outputs": [], + "source": [ + "# plot 2D x-t-Data evaluation in SI units\n", + "\n", + "xPlotLimits = np.array([0,xSize]) * dx\n", + "tPlotLimits = np.array([min(evalTimeSteps), max(evalTimeSteps)]) * dt\n", + "averageHb = np.average(hbOverTime) * dx\n", + "\n", + "#fluidSurfaceOverTime = (hbOverTime+hfOverTime) * 0.00025\n", + "plt.figure()\n", + "plt.imshow(hbOverTime[:] * dx - averageHb, interpolation='none', origin='lower',\n", + " aspect='auto', extent=(*xPlotLimits,*tPlotLimits), vmin=-5e-3, vmax=5e-3)\n", + "#plt.imshow(fluidSurfaceOverTime[:,:], interpolation='none', origin='lower',aspect='auto')\n", + "plt.colorbar()\n", + "plt.xlabel(r\"$x$ / m\")\n", + "plt.ylabel(r\"$t$ / s\")\n", + "plt.show()\n", + "\n", + "print(tPlotLimits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4223c1cc", + "metadata": {}, + "outputs": [], + "source": [ + "# plot bed height evaluation in SI units with\n", + " # time restricted to 30 - 75 s\n", + " # x-domain restricted to 0 - 0.75 m\n", + "\n", + "timestep_length = evalTimeSteps[1] - evalTimeSteps[0]\n", + "\n", + "t_min = 30\n", + "timestep_index_min = int((t_min / dt) // timestep_length)\n", + "\n", + "t_max = 75\n", + "timestep_index_max = int((t_max / dt) // timestep_length)\n", + "\n", + "x_min = 0\n", + "x_index_min = int(x_min // dx)\n", + "\n", + "x_max = 0.75\n", + "x_index_max = int(x_max // dx)\n", + "\n", + 
"xPlotLimits = np.array([x_min,x_max])\n", + "tPlotLimits = np.array([t_min, t_max])\n", + "averageHb = np.average(hbOverTime[timestep_index_min:timestep_index_max,x_index_min:x_index_max]) * dx\n", + "\n", + "plt.figure()\n", + "plt.imshow(hbOverTime[timestep_index_min:timestep_index_max,x_index_min:x_index_max] * dx - averageHb, interpolation='none', origin='lower',\n", + " aspect='auto', extent=(*xPlotLimits,*tPlotLimits), vmin=-5e-3, vmax=5e-3)\n", + "\n", + "cb = plt.colorbar(label=r\"$h$ / m\")\n", + "match_colorbar(cb)\n", + "#cb.ax.tick_params(labelsize=8) \n", + "plt.xlabel(r\"$x$ / m\")\n", + "plt.ylabel(r\"$t$ / s\")\n", + "\n", + "import tikzplotlib\n", + "tikzplotlib.clean_figure()\n", + "tikzplotlib.save(\n", + " '/home/rzlin/ca36xymo/tikz/bed-elevation-simulation.tex',\n", + " axis_height = '\\\\figureheight',\n", + " axis_width = '\\\\figurewidth'\n", + " )\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bed63b06", + "metadata": {}, + "outputs": [], + "source": [ + "print(np.average(hfOverTime[timestep_index_min:timestep_index_max,x_index_min:x_index_max]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8c8fc0e", + "metadata": {}, + "outputs": [], + "source": [ + "import scipy.fft\n", + "\n", + "# plot PSD with\n", + " # time restricted to 30 - 75 s\n", + " # x-domain restricted to 0 - 0.75 m\n", + "\n", + "timestep_length = evalTimeSteps[1] - evalTimeSteps[0]\n", + "\n", + "t_min = 30\n", + "timestep_index_min = int((t_min / dt) // timestep_length)\n", + "\n", + "t_max = 75\n", + "timestep_index_max = int((t_max / dt) // timestep_length)\n", + "\n", + "x_min = 0\n", + "x_index_min = int(x_min // dx)\n", + "\n", + "x_max = 0.75\n", + "x_index_max = int(x_max // dx)\n", + "\n", + "averageHb = np.average(hbOverTime[timestep_index_min:timestep_index_max,x_index_min:x_index_max]) * dx\n", + "\n", + "hbOverTimeShort = 
hbOverTime[timestep_index_min:timestep_index_max,x_index_min:x_index_max] * dx - averageHb\n", + "\n", + "# power spectral density (PSD) on period-wavelength (PW) plane\n", + "psd_pw = scipy.fft.fft2(hbOverTimeShort)\n", + "psd_pw = np.abs(psd_pw)**2 / (hbOverTimeShort.shape[0]*hbOverTimeShort.shape[1])\n", + "\n", + "ft_period = np.abs(1 / scipy.fft.fftfreq(np.transpose(hbOverTimeShort).shape[-1],timestep_length)[1:] * dt)\n", + "ft_wavelength = np.abs(1 / scipy.fft.fftfreq(hbOverTimeShort.shape[-1],1)[1:] * dx)\n", + "\n", + "plt.figure()\n", + "cs = plt.contour(ft_period, ft_wavelength, np.transpose(np.abs(psd_pw[1:,1:])), levels=[0.01], colors='tab:orange',linewidths=6)\n", + "plt.xlabel(r\"$T$ / s\")\n", + "plt.ylabel(r\"$\\lambda$ / m\")\n", + "ax = plt.gca()\n", + "ax.set_xlim([5, 70])\n", + "ax.set_ylim([0.05, 0.2])\n", + "#cb = plt.colorbar(label=r\"PSD / m$^2$\")\n", + "\n", + "\n", + "## uncomment the following things to only store the png (no axes etc.)\n", + "## => tikz image must be created manually later\n", + "# plt.subplots_adjust(bottom = 0)\n", + "# plt.subplots_adjust(top = 1)\n", + "# plt.subplots_adjust(right = 1)\n", + "# plt.subplots_adjust(left = 0)\n", + "# plt.axis('off')\n", + "# plt.savefig('/home/rzlin/ca36xymo/tikz/spectral-density-simulation.png',bbox_inches='tight',transparent=True, pad_inches=0)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afd23ef9", + "metadata": {}, + "outputs": [], + "source": [ + "psd_pw_new = np.transpose(np.abs(psd_pw[1:,1:]))\n", + "\n", + "# compute celerity (for each point in wavelength-period PSD-array)\n", + "# => identify all possible y-axis values for new celerity-wavelength PSD-array\n", + "c = []\n", + "x = 0\n", + "for wavelength in ft_wavelength:\n", + " y = 0\n", + " for period in ft_period:\n", + " if psd_pw_new[x,y] > 0.005:\n", + " c.append(wavelength / period)\n", + " y += 1\n", + " x += 1\n", + " \n", + "c = list(set(c)) # remove duplicate 
entries from list\n", + "c.sort() # sort in ascending order\n", + "# list 'c' contains all possible celerities in ascending order without duplicates\n", + "# => this will be the x-axis of the celerity-wavelength PSD-array \n", + "\n", + "# create celerity-wavelength PSD-array \n", + "psd_cw = np.zeros((ft_wavelength.shape[0], len(c)))\n", + "\n", + "# fill this array with PSD at correct position\n", + "for i_wavelength in range(0, ft_wavelength.shape[0]):\n", + " for i_period in range(0, ft_period.shape[0]):\n", + " if psd_pw_new[i_wavelength,i_period] > 0.005:\n", + " celerity = ft_wavelength[i_wavelength] / ft_period[i_period] # compute celerity\n", + " i_celerity = c.index(celerity) # map celerity to it's index on the x-axis of the new array\n", + " psd_cw[i_wavelength,i_celerity] = abs(psd_pw_new[i_wavelength,i_period]) # store PSD at correct position in new array\n", + "\n", + "plt.figure()\n", + "plt.contourf(ft_wavelength, np.asarray(c), np.abs(np.transpose(psd_cw)),colors='tab:orange')\n", + "#plt.colorbar()\n", + "plt.xlabel(r\"$\\lambda$ / m\")\n", + "plt.ylabel(r\"$c$ / m/s\")\n", + "ax = plt.gca()\n", + "ax.set_xlim([0.05, 0.2])\n", + "ax.set_ylim([0, 0.015])\n", + "\n", + "# # uncomment the following things to only store the png (no axes etc.)\n", + "# # => tikz image must be created manually later\n", + "plt.subplots_adjust(bottom = 0)\n", + "plt.subplots_adjust(top = 1)\n", + "plt.subplots_adjust(right = 1)\n", + "plt.subplots_adjust(left = 0)\n", + "plt.axis('off')\n", + "plt.savefig('/home/rzlin/ca36xymo/tikz/celerity-simulation-e1.png',bbox_inches='tight',transparent=True, pad_inches=0)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8146b19", + "metadata": {}, + "outputs": [], + "source": [ + "# FFT along x per time frame -> find wavelength\n", + "\n", + "import scipy.fft\n", + "\n", + "N = hbOverTime.shape[-1] # = number of samples, here xSize\n", + "colors = 
sns.color_palette(\"crest\",hbOverTime.shape[0] )\n", + "plt.figure()\n", + "\n", + "for i,t in enumerate(evalTimeSteps):\n", + "\n", + " yf = scipy.fft.rfft(hbOverTime[i])\n", + " sampleSpacing = 1 # = dx\n", + " \n", + " # discard first element (is 0 frequency -> offset of sinus)\n", + " xf = scipy.fft.rfftfreq(N,sampleSpacing)[1:]\n", + " yfMod = (2.0 / N * np.abs(yf))[1:]\n", + " plt.plot(1/xf, yfMod, color=colors[i])\n", + "\n", + " print(t, \"Dominant wavelength / D = \", (1/xf[np.argmax(yfMod)])/avgDiameter )\n", + "\n", + "plt.xlabel(\"wave length (LU)\")\n", + "plt.ylabel(\"amplitude (LU)\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c441c91", + "metadata": {}, + "outputs": [], + "source": [ + "# FFT along y per dx -> find period length (same as above with transposed array)\n", + "\n", + "import scipy.fft\n", + "\n", + "averageHb = np.average(hbOverTime) * dx\n", + "# use only data from some later time step to avoid including the influence of the initial condition\n", + "hbOverTimeShort = hbOverTime[:250,:] * dx - averageHb\n", + "\n", + "hbOverTimeTransposed = np.transpose(hbOverTimeShort)\n", + "\n", + "N = hbOverTimeTransposed.shape[-1] # = number of samples, here xSize\n", + "colors = sns.color_palette(\"crest\", hbOverTimeTransposed.shape[0] )\n", + "plt.figure()\n", + "\n", + "for i,t in enumerate(np.arange(0,hbOverTime.shape[-1])):\n", + "\n", + " yf = scipy.fft.rfft(hbOverTimeTransposed[i])\n", + " sampleSpacing = evalTimeSteps[1] - evalTimeSteps[0]\n", + " \n", + " # discard first element (is 0 frequency -> offset of sinus)\n", + " xf = scipy.fft.rfftfreq(N,sampleSpacing)[1:]\n", + " yfMod = (2.0 / N * np.abs(yf))[1:]\n", + " plt.plot(1/xf, yfMod, color=colors[i])\n", + "\n", + " print(t, \"Dominant period length / s = \", (1/xf[np.argmax(yfMod)]) * dt )\n", + "\n", + "plt.xlabel(\"Period length T / s\")\n", + "plt.ylabel(\"amplitude (LU)\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "19469336", + "metadata": {}, + "outputs": [], + "source": [ + "# # transport rate\n", + "\n", + "# def readValueOverTimeFromFile(fileName, columnIndex):\n", + "# return np.transpose(np.loadtxt(fileName, usecols=(0,columnIndex)))\n", + "\n", + "# sedimentTransportRateData = readValueOverTimeFromFile(baseFolder+'/bedload.txt',1)\n", + "\n", + "\n", + "# transportRateRef = avgDiameter * averageVelocity # = l_ref^2 / t_ref, m^2 / s\n", + "\n", + "# plt.figure()\n", + "# plt.plot(sedimentTransportRateData[0] / timeRef, sedimentTransportRateData[1] / transportRateRef, '-',label=\"simulation\")\n", + "\n", + "# plt.axhline(2.0e-5 / (2.9e-3 * 0.37), linestyle='--', color='k', label=\"E1 (experiment)\")\n", + "# plt.axhline(6.1e-5 / (2.9e-3 * 0.46), linestyle='-.', color='k', label=\"E4 (experiment)\")\n", + "# plt.xlabel(r\"$t / t_{ref}$\")\n", + "# plt.ylabel(r\"$q_s / q_{ref}$\")\n", + "# plt.grid()\n", + "# plt.legend()\n", + "# plt.show()\n", + "\n", + "# # import tikzplotlib\n", + "# # tikzplotlib.clean_figure()\n", + "# # tikzplotlib.save(\n", + "# # 'bedload-transport-rate.tex',\n", + "# # axis_height = '\\\\figureheight',\n", + "# # axis_width = '\\\\figurewidth'\n", + "# # )\n", + "\n", + "# # in Pascal Paper:\n", + "# # q_s,out approx q_s,in (Fig. 5)\n", + "# # -> take values from Tab. 
1 as reference\n", + "# # E1: 2.0e-5 , E4: 6.1e-5 m^2/s\n", + "# # normalize by averageDiameter ( = 2.9mm) * averageVelocity (U)\n", + "# # -> E1: q_normalized = 0.0186, E4: q_normalized = 0.0457" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b43b04f7", + "metadata": {}, + "outputs": [], + "source": [ + "# transport rate in SI units\n", + "\n", + "def readValueOverTimeFromFile(fileName, columnIndex):\n", + " return np.transpose(np.loadtxt(fileName, usecols=(0,columnIndex)))\n", + "\n", + "sedimentTransportRateData = readValueOverTimeFromFile(baseFolder+'/bedload.txt',1)\n", + "\n", + "timestep_length = sedimentTransportRateData[0,1] - sedimentTransportRateData[0,0]\n", + "\n", + "t_min = 30\n", + "timestep_index_min = int((t_min / dt) // timestep_length)\n", + "\n", + "t_max = 75\n", + "timestep_index_max = int((t_max / dt) // timestep_length)\n", + "\n", + "plt.figure()\n", + "\n", + "## E1\n", + "#filepath = \"/simdata/on74yces/experiment-bedload-rate/qs_E1.csv\"\n", + "#exp_mean = np.array([[t_min, t_min + 10, t_max], [2.0e-5, 2.0e-5, 2.0e-5]])\n", + "\n", + "# E4\n", + "filepath = \"/simdata/on74yces/experiment-bedload-rate/qs_E4.csv\"\n", + "exp_mean = np.array([[t_min, t_min + 10, t_max], [6.1e-5, 6.1e-5, 6.1e-5]])\n", + "\n", + "data = np.transpose(np.loadtxt(filepath, delimiter=';',skiprows=1))\n", + "plt.plot(data[0,int(t_min//0.2):int(t_max//0.2)], data[1,int(t_min//0.2):int(t_max//0.2)], '-',label=\"Experiment\")\n", + "#plt.plot(exp_mean[0], exp_mean[1], linestyle='--', color='black', label=\"Experiment (mean)\")\n", + "\n", + "plt.plot(sedimentTransportRateData[0,timestep_index_min:timestep_index_max] * dt, \n", + " sedimentTransportRateData[1,timestep_index_min:timestep_index_max] * dx**2 / dt, '-',label=\"Simulation\")\n", + "mean = np.mean(sedimentTransportRateData[1,timestep_index_min:timestep_index_max] * dx**2 / dt)\n", + "mean_array = np.array([[t_min, t_min + 10, t_max], [mean, mean, mean]])\n", + 
"#plt.plot(mean_array[0], mean_array[1], linestyle='-', color='black', label=\"Simulation (mean)\")\n", + "\n", + "plt.xlabel(r\"$t$ / s\")\n", + "plt.ylabel(r\"$q_s$ / (m$^{2}$/s)\")\n", + "\n", + "ax = plt.gca()\n", + "ax.set_xlim([t_min, t_max])\n", + "\n", + "plt.grid()\n", + "#plt.legend()\n", + "plt.show()\n", + "\n", + "print(np.mean(sedimentTransportRateData[1,timestep_index_min:timestep_index_max] * dx**2 / dt))\n", + "\n", + "# For tikzplotlib export:\n", + "# - remove legend\n", + "# - remove mean plots\n", + "import tikzplotlib\n", + "tikzplotlib.clean_figure()\n", + "tikzplotlib.save(\n", + " '/home/rzlin/ca36xymo/tikz/bedload-rate.tex',\n", + " axis_height = '\\\\figureheight',\n", + " axis_width = '\\\\figurewidth'\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40c316a1", + "metadata": {}, + "outputs": [], + "source": [ + "# compute median particle diameter\n", + "\n", + "import numpy as np\n", + "\n", + "filepath = \"/simdata/on74yces/2395774/spheres_out.dat\"\n", + "\n", + "particles = np.loadtxt(filepath, delimiter=' ', skiprows=1)\n", + "\n", + "# 50th percentile diameter\n", + "diameterMedian = np.median(particles[:,4]*2)\n", + "print(\"50th percentile, i.e., median particle diameter: \", diameterMedian, \"m\")\n", + "print(\"50th percentile, i.e., median particle diameter: \", diameterMedian / dx, \"cells\")\n", + "\n", + "# 16th percentile diameter\n", + "diameter16th = np.percentile(particles[:,4]*2, 16)\n", + "print(\"16th percentile particle diameter: \", diameter16th, \"m\")\n", + "print(\"16th percentile particle diameter: \", diameter16th / dx, \"cells\")\n", + "\n", + "# 84th percentile diameter\n", + "diameter84th = np.percentile(particles[:,4]*2, 84)\n", + "print(\"84th percentile particle diameter: \", diameter84th, \"m\")\n", + "print(\"84th percentile particle diameter: \", diameter84th / dx, \"cells\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f7f03b1", + 
"metadata": {}, + "outputs": [], + "source": [ + "# plot data from experiment of Pascal et al.\n", + "import math\n", + "\n", + "filepath = \"/simdata/on74yces/experiment-bed-elevation/bed_surface_in_time_E4.csv\"\n", + "data = np.loadtxt(filepath, delimiter=';')\n", + "\n", + "# crop data to relevant time steps\n", + "data = data[1200:1450,:]\n", + "\n", + "# remove inclination from data\n", + "sine3degree = math.sin(3.0*math.pi/180)\n", + "for x in range(0, data.shape[1]):\n", + " data[:,x] = data[:,x] - abs(x-1280)*0.75/data.shape[1]*sine3degree\n", + "\n", + "xPlotLimitsN = np.array([0, data.shape[1]]) * 0.75/data.shape[1]\n", + "tPlotLimitsN = np.array([1200, 1200+data.shape[0]])\n", + "averageHbN = np.average(data)\n", + "\n", + "plt.figure()\n", + "plt.imshow(data - averageHbN, interpolation='none', origin='lower',\n", + " aspect='auto', extent=(*xPlotLimitsN,*tPlotLimitsN), vmin=-5e-3, vmax=5e-3)\n", + "plt.colorbar()\n", + "plt.xlabel(r\"$x$ / m\")\n", + "plt.ylabel(r\"$t$ / s\")\n", + "plt.show()\n", + "\n", + "print(data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45779e36", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "\n", + "# plot bed height evaluation of experiment\n", + " # time restricted to 1200 - 1245 s\n", + " # x-domain restricted to 0 - 0.75 m\n", + " \n", + "filepath = \"/simdata/on74yces/experiment-bed-elevation/bed_surface_in_time_E1.csv\"\n", + "hbOverTimeExp = np.loadtxt(filepath, delimiter=';')\n", + "\n", + "dxExp = 0.75 / 1280\n", + "\n", + "# remove inclination from data\n", + "sine3degree = math.sin(3*math.pi/180)\n", + "for x in range(0, hbOverTimeExp.shape[1]):\n", + " hbOverTimeExp[:,x] = hbOverTimeExp[:,x] - abs(x-1280)*0.75/hbOverTimeExp.shape[1]*sine3degree\n", + "\n", + "t_min = 1200\n", + "timestep_index_min = t_min\n", + "\n", + "t_max = 1245\n", + "timestep_index_max = t_max\n", + "\n", + "x_min = 0\n", + "x_index_min = int(x_min // dxExp)\n", + "\n", + "x_max = 
0.75\n", + "x_index_max = int(x_max // dxExp)\n", + "\n", + "xPlotLimits = np.array([x_min,x_max])\n", + "tPlotLimits = np.array([t_min, t_max])\n", + "averageHbExp = np.average(hbOverTimeExp)\n", + "\n", + "plt.figure()\n", + "plt.imshow(hbOverTimeExp[timestep_index_min:timestep_index_max,x_index_min:x_index_max] - averageHbExp, \n", + " interpolation='none', origin='lower', aspect='auto', extent=(*xPlotLimits,*tPlotLimits), \n", + " vmin=-5e-3, vmax=5e-3)\n", + "\n", + "cb = plt.colorbar(label=r\"$h$ / m\")\n", + "#match_colorbar(cb)\n", + "#cb.ax.tick_params(labelsize=8) \n", + "plt.xlabel(r\"$x$ / m\")\n", + "plt.ylabel(r\"$t$ / s\")\n", + "\n", + "import tikzplotlib\n", + "tikzplotlib.clean_figure()\n", + "tikzplotlib.save(\n", + " '/home/rzlin/ca36xymo/tikz/bed-elevation-experiment.tex',\n", + " axis_height = '\\\\figureheight',\n", + " axis_width = '\\\\figurewidth'\n", + " )\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dec9d4b2", + "metadata": {}, + "outputs": [], + "source": [ + "import scipy.fft\n", + "\n", + "# plot spectral density of bed height in experiment\n", + "\n", + "filepath = \"/simdata/on74yces/experiment-bed-elevation/bed_surface_in_time_E1.csv\"\n", + "hbOverTimeExp = np.loadtxt(filepath, delimiter=';')\n", + "\n", + "dxExp = 0.75 / 1280\n", + "\n", + "# remove inclination from data\n", + "sine3degree = math.sin(2.9*math.pi/180)\n", + "for x in range(0, hbOverTimeExp.shape[1]):\n", + " hbOverTimeExp[:,x] = hbOverTimeExp[:,x] - abs(x-1280)*0.75/hbOverTimeExp.shape[1]*sine3degree\n", + "\n", + "averageHbExp = np.average(hbOverTimeExp)\n", + "\n", + "# power spectral density (PSD) on period-wavelength (PW) plane\n", + "psd_pw_exp = scipy.fft.fft2(hbOverTimeExp - averageHbExp)\n", + "psd_pw_exp = np.abs(psd_pw_exp)**2 / (hbOverTimeExp.shape[0] * hbOverTimeExp.shape[1])\n", + "\n", + "ft_period_exp = np.abs(1 / scipy.fft.fftfreq(np.transpose(hbOverTimeExp).shape[-1],1)[1:])\n", + 
"ft_wavelength_exp = np.abs(1 / scipy.fft.fftfreq(hbOverTimeExp.shape[-1],0.75/1280)[1:])\n", + "\n", + "plt.figure()\n", + "\n", + "plt.contourf(ft_period_exp, ft_wavelength_exp, np.transpose(np.abs(psd_pw_exp[1:,1:])))\n", + "#cb = plt.colorbar(label=r\"PSD / m$^2$\")\n", + "#match_colorbar(cb)\n", + "plt.xlabel(r\"$T$ / s\")\n", + "plt.ylabel(r\"$\\lambda$ / m\")\n", + "ax = plt.gca()\n", + "ax.set_xlim([5, 70])\n", + "ax.set_ylim([0.05, 0.2])\n", + "\n", + "# uncomment the following things to only store the png (no axes etc.)\n", + "# => tikz image must be created manually later\n", + "plt.subplots_adjust(bottom = 0)\n", + "plt.subplots_adjust(top = 1)\n", + "plt.subplots_adjust(right = 1)\n", + "plt.subplots_adjust(left = 0)\n", + "plt.axis('off')\n", + "plt.savefig('/home/rzlin/ca36xymo/tikz/spectral-density-experiment.png',bbox_inches='tight',transparent=True, pad_inches=0)\n", + "\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf86063d", + "metadata": {}, + "outputs": [], + "source": [ + "psd_pw_exp_new = np.transpose(np.abs(psd_pw_exp[1:,1:]))\n", + "\n", + "# compute celerity (for each point in wavelength-period PSD-array)\n", + "# => identify all possible y-axis values for new celerity-wavelength PSD-array\n", + "c_exp = []\n", + "x = 0\n", + "for wavelength in ft_wavelength_exp:\n", + " y = 0\n", + " for period in ft_period_exp:\n", + " if psd_pw_exp_new[x,y] > 0.0001:\n", + " c_exp.append(wavelength / period)\n", + " y += 1\n", + " x += 1\n", + "\n", + "c_exp = list(set(c_exp)) # remove duplicate entries from list\n", + "c_exp.sort() # sort in ascending order\n", + "# list 'c' contains all possible celerities in ascending order without duplicates\n", + "# => this will be the x-axis of the celerity-wavelength PSD-array\n", + "\n", + "# create celerity-wavelength PSD-array \n", + "psd_cw = np.zeros((ft_wavelength_exp.shape[0], len(c_exp)))\n", + "\n", + "# fill this array with PSD at correct 
position\n", + "for i_wavelength in range(0, ft_wavelength_exp.shape[0]):\n", + " for i_period in range(0, ft_period_exp.shape[0]):\n", + " if psd_pw_exp_new[i_wavelength,i_period] > 0.0001:\n", + " celerity = ft_wavelength_exp[i_wavelength] / ft_period_exp[i_period] # compute celerity\n", + " i_celerity = c_exp.index(celerity) # map celerity to it's index on the x-axis of the new array\n", + " psd_cw[i_wavelength,i_celerity] = abs(psd_pw_exp_new[i_wavelength,i_period]) # store PSD at correct position in new array\n", + "\n", + "plt.figure()\n", + "plt.contourf(ft_wavelength_exp, np.asarray(c_exp), np.abs(np.transpose(psd_cw)),colors='tab:blue')\n", + "#plt.colorbar()\n", + "plt.xlabel(r\"$\\lambda$ / m\")\n", + "plt.ylabel(r\"$c$ / m/s\")\n", + "ax = plt.gca()\n", + "ax.set_xlim([0.05, 0.2])\n", + "ax.set_ylim([0, 0.015])\n", + "\n", + "# uncomment the following things to only store the png (no axes etc.)\n", + "# => tikz image must be created manually later\n", + "plt.subplots_adjust(bottom = 0)\n", + "plt.subplots_adjust(top = 1)\n", + "plt.subplots_adjust(right = 1)\n", + "plt.subplots_adjust(left = 0)\n", + "plt.axis('off')\n", + "plt.savefig('/home/rzlin/ca36xymo/tikz/celerity-experiment-e1.png',bbox_inches='tight',transparent=True, pad_inches=0)\n", + "\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7168b78e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# grid refinement study: transport rate in SI units\n", + "\n", + "def readValueOverTimeFromFile(fileName, columnIndex):\n", + " return np.transpose(np.loadtxt(fileName, usecols=(0,columnIndex)))\n", + "\n", + "t_min = 0\n", + "t_max = 50\n", + "t_min_for_mean = 10\n", + "\n", + "dx_coarse = 0.0005\n", + "dt_coarse = 2.16294e-05\n", + "sedimentTransportRateData_coarse = readValueOverTimeFromFile(\"/simdata/on74yces/resolution5\"+'/bedload.txt',1)\n", + "timestep_length_coarse = sedimentTransportRateData_coarse[0,1] - 
sedimentTransportRateData_coarse[0,0]\n", + "timestep_index_min_coarse = int((t_min / dt_coarse) // timestep_length_coarse)\n", + "timestep_index_max_coarse = int((t_max / dt_coarse) // timestep_length_coarse)\n", + "timestep_index_min_mean_coarse = int((35 / dt_coarse) // timestep_length_coarse)\n", + "mean_coarse = np.mean(sedimentTransportRateData_coarse[1,timestep_index_min_mean_coarse:timestep_index_max_coarse] * dx_coarse**2 / dt_coarse)\n", + "mean_array_coarse = np.array([[35, 35+1, t_max], [mean_coarse, mean_coarse, mean_coarse]])\n", + "\n", + "print(sedimentTransportRateData_coarse[0,-1])\n", + "print(sedimentTransportRateData_coarse[0,-1] * dt_coarse)\n", + "print(mean_coarse)\n", + "\n", + "dx_medium = 0.00025\n", + "dt_medium = 1.08147e-05\n", + "sedimentTransportRateData_medium = readValueOverTimeFromFile(\"/simdata/on74yces/resolution10\"+'/bedload.txt',1)\n", + "timestep_length_medium = sedimentTransportRateData_medium[0,1] - sedimentTransportRateData_medium[0,0]\n", + "timestep_index_min_medium = int((t_min / dt_medium) // timestep_length_medium)\n", + "timestep_index_max_medium = int((t_max / dt_medium) // timestep_length_medium)\n", + "timestep_index_min_mean_medium = int((t_min_for_mean / dt_medium) // timestep_length_medium)\n", + "mean_medium = np.mean(sedimentTransportRateData_medium[1,timestep_index_min_mean_medium:timestep_index_max_medium] * dx_medium**2 / dt_medium)\n", + "mean_array_medium = np.array([[t_min_for_mean, t_min_for_mean+1, t_max], [mean_medium, mean_medium, mean_medium]])\n", + "\n", + "print(sedimentTransportRateData_medium[0,-1])\n", + "print(sedimentTransportRateData_medium[0,-1] * dt_medium)\n", + "print(mean_medium)\n", + "\n", + "dx_fine = 0.000125\n", + "dt_fine = 5.40734e-06\n", + "sedimentTransportRateData_fine = readValueOverTimeFromFile(\"/simdata/on74yces/resolution20\"+'/bedload.txt',1)\n", + "timestep_length_fine = sedimentTransportRateData_fine[0,1] - sedimentTransportRateData_fine[0,0]\n", + 
"timestep_index_min_fine = int((t_min / dt_fine) // timestep_length_fine)\n", + "timestep_index_max_fine = int((t_max / dt_fine) // timestep_length_fine)\n", + "timestep_index_min_mean_fine = int((t_min_for_mean / dt_fine) // timestep_length_fine)\n", + "mean_fine = np.mean(sedimentTransportRateData_fine[1,timestep_index_min_mean_fine:timestep_index_max_fine] * dx_fine**2 / dt_fine)\n", + "mean_array_fine = np.array([[t_min_for_mean, t_min_for_mean+1, t_max], [mean_fine, mean_fine, mean_fine]])\n", + "\n", + "print(sedimentTransportRateData_fine[0,-1])\n", + "print(sedimentTransportRateData_fine[0,-1] * dt_fine)\n", + "print(mean_fine)\n", + "\n", + "plt.figure()\n", + "plt.plot(sedimentTransportRateData_coarse[0,timestep_index_min_coarse:timestep_index_max_coarse:4] * dt_coarse,\n", + " sedimentTransportRateData_coarse[1,timestep_index_min_coarse:timestep_index_max_coarse:4] * dx_coarse**2 / dt_coarse,\n", + " '-',label=\"Coarse\",color=\"tab:purple\")\n", + "\n", + "plt.plot(sedimentTransportRateData_medium[0,timestep_index_min_medium:timestep_index_max_medium:8] * dt_medium,\n", + " sedimentTransportRateData_medium[1,timestep_index_min_medium:timestep_index_max_medium:8] * dx_medium**2 / dt_medium,\n", + " '-',label=\"Medium\")\n", + "\n", + "plt.plot(sedimentTransportRateData_fine[0,timestep_index_min_fine:timestep_index_max_fine:16] * dt_fine,\n", + " sedimentTransportRateData_fine[1,timestep_index_min_fine:timestep_index_max_fine:16] * dx_fine**2 / dt_fine,\n", + " '-',label=\"Fine\")\n", + "\n", + "plt.plot(mean_array_coarse[0], mean_array_coarse[1], linestyle=':', color='black', label=\"Coarse (mean)\")\n", + "plt.plot(mean_array_medium[0], mean_array_medium[1], linestyle='-', color='black', label=\"Medium (mean)\")\n", + "plt.plot(mean_array_fine[0], mean_array_fine[1], linestyle='--', color='black', label=\"Fine (mean)\")\n", + "\n", + "plt.xlabel(r\"$t$ / s\")\n", + "plt.ylabel(r\"$q_s$ / (m$^{2}$/s)\")\n", + "\n", + "ax = plt.gca()\n", + 
"ax.set_xlim([t_min, t_max])\n", + "\n", + "plt.grid()\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "# # For tikzplotlib export:\n", + "# # - remove legend\n", + "# # - remove mean plots\n", + "# import tikzplotlib\n", + "# tikzplotlib.clean_figure()\n", + "# tikzplotlib.save(\n", + "# '/home/rzlin/ca36xymo/tikz/resolution-study.tex',\n", + "# axis_height = '\\\\figureheight',\n", + "# axis_width = '\\\\figurewidth'\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75ff7fe7", + "metadata": {}, + "outputs": [], + "source": [ + "def readValueOverTimeFromFile(fileName, columnIndex):\n", + " return np.loadtxt(fileName, usecols=(0,columnIndex))\n", + "\n", + "dx = 0.00025\n", + "dt = 1.08147e-05\n", + "velocity = readValueOverTimeFromFile(\"/simdata/on74yces/merge_E4_flat_new\"+'/fluidInfo.txt',2)\n", + "t = velocity[:,0] * dt\n", + "velocity = velocity[:,1] * dx/dt\n", + "plt.plot(t, velocity)\n", + "\n", + "plt.ylabel(r\"$U_{x,\\text{l}}$ / (m/s)\")\n", + "plt.xlabel(r\"$t$ / s\")\n", + "\n", + "import tikzplotlib\n", + "tikzplotlib.clean_figure()\n", + "tikzplotlib.save(\n", + " '/home/rzlin/ca36xymo/tikz/bed-elevation-simulation.tex',\n", + " axis_height = '\\\\figureheight',\n", + " axis_width = '\\\\figurewidth'\n", + " )\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0605ba06", + "metadata": {}, + "outputs": [], + "source": [ + "# old stuff below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40b9bb68", + "metadata": {}, + "outputs": [], + "source": [ + "(3*0.37 / 0.0083 * 1e-6)/((2.55-1)*9.81*2.9e-3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f88f760", + "metadata": {}, + "outputs": [], + "source": [ + "print(np.max(hbOverTime[0,:]),np.min(hbOverTime[0,:]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "345263f0", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as 
sns\n", + "plt.figure()\n", + "sns.distplot(x=fluidSurfaceOverTime[0,:])\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/apps/showcases/BidisperseFluidizedBed/BidisperseFluidizedBedDPM.cpp b/apps/showcases/BidisperseFluidizedBed/BidisperseFluidizedBedDPM.cpp index 15a4580f421157b2021f527122a4ae9e74fe1258..549c41fd2a5bd45949900abe4e7bfb39f1f04f76 100644 --- a/apps/showcases/BidisperseFluidizedBed/BidisperseFluidizedBedDPM.cpp +++ b/apps/showcases/BidisperseFluidizedBed/BidisperseFluidizedBedDPM.cpp @@ -986,20 +986,20 @@ int main( int argc, char **argv ) { ////////////////////// // create force field - BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >( blocks, "force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); + BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >( blocks, "force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); - BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "drag force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID amForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "am force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "lift force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); + BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "drag force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID amForceFieldID = 
field::addToStorage< Vec3Field_T >( blocks, "am force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "lift force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); // create omega field - BlockDataID omegaFieldID = field::addToStorage< ScalarField_T >( blocks, "omega field", omega, field::zyxf, FieldGhostLayers ); + BlockDataID omegaFieldID = field::addToStorage< ScalarField_T >( blocks, "omega field", omega, field::fzyx, FieldGhostLayers ); // create the lattice model LatticeModel_T latticeModel = LatticeModel_T( omegaFieldID, ForceModel_T( forceFieldID ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel, initialFluidVelocity, real_t(1), FieldGhostLayers, field::zyxf ); + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, initialFluidVelocity, real_t(1), FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -1008,30 +1008,30 @@ int main( int argc, char **argv ) { BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, pdfFieldID, uInflow ), "boundary handling" ); // field to store fluid velolcity - BlockDataID velocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers ); - BlockDataID oldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "old velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers ); - BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "swapped old velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers ); + BlockDataID velocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity field", initialFluidVelocity, 
field::fzyx, FieldGhostLayers ); + BlockDataID oldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "old velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers ); + BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "swapped old velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers ); // create pressure field - BlockDataID pressureFieldID = field::addToStorage< ScalarField_T >( blocks, "pressure field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID pressureFieldID = field::addToStorage< ScalarField_T >( blocks, "pressure field", real_t(0), field::fzyx, FieldGhostLayers ); // create solid volume fraction field - BlockDataID svfFieldID = field::addToStorage< ScalarField_T >( blocks, "svf field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID svfFieldID = field::addToStorage< ScalarField_T >( blocks, "svf field", real_t(0), field::fzyx, FieldGhostLayers ); // field to store pressure gradient - BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "pressure gradient field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "pressure gradient field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store curl of fluid velocity - BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity curl field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity curl field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store velocity gradient - BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( blocks, "velocity gradient field", Matrix3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( 
blocks, "velocity gradient field", Matrix3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store gradient of stress tensor - BlockDataID stressTensorGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "stress tensor gradient field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID stressTensorGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "stress tensor gradient field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store time derivative of fluid velocity - BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "time derivative velocity field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "time derivative velocity field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // communication schemes pe_coupling::discrete_particle_methods::CombinedReductionFieldCommunication<Vec3Field_T> forceComm( blocks, forceFieldID ); diff --git a/apps/showcases/CMakeLists.txt b/apps/showcases/CMakeLists.txt index 3807122564dc25156a976795af060a310583bb0a..68e62ec3ace422008b782550cd93570ab9f49358 100644 --- a/apps/showcases/CMakeLists.txt +++ b/apps/showcases/CMakeLists.txt @@ -9,9 +9,12 @@ add_subdirectory( Mixer ) add_subdirectory( ParticlePacking ) add_subdirectory( PegIntoSphereBed ) if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_PYTHON ) -add_subdirectory( PhaseFieldAllenCahn ) + add_subdirectory( PhaseFieldAllenCahn ) endif() if ( WALBERLA_BUILD_WITH_CODEGEN AND NOT WALBERLA_BUILD_WITH_OPENMP) add_subdirectory( PorousMedia ) endif() +if ( WALBERLA_BUILD_WITH_CODEGEN ) + add_subdirectory( Antidunes ) +endif() diff --git a/apps/showcases/CombinedResolvedUnresolved/CombinedResolvedUnresolved.cpp b/apps/showcases/CombinedResolvedUnresolved/CombinedResolvedUnresolved.cpp index 
026171328e9f7141679421e05c05b43e7556d2ac..4277a912c6282f3cee0c012c009521a15566210a 100644 --- a/apps/showcases/CombinedResolvedUnresolved/CombinedResolvedUnresolved.cpp +++ b/apps/showcases/CombinedResolvedUnresolved/CombinedResolvedUnresolved.cpp @@ -576,23 +576,23 @@ int main(int argc, char** argv) // create force field BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >(blocks, "force field", Vector3< real_t >(real_t(0)), - field::zyxf, FieldGhostLayers); + field::fzyx, FieldGhostLayers); // create omega field BlockDataID omegaFieldID = - field::addToStorage< ScalarField_T >(blocks, "omega field", real_t(0), field::zyxf, FieldGhostLayers); + field::addToStorage< ScalarField_T >(blocks, "omega field", real_t(0), field::fzyx, FieldGhostLayers); // create the lattice model LatticeModel_T latticeModel = LatticeModel_T(omegaFieldID, ForceModel_T(forceFieldID)); // add PDF field BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( - blocks, "pdf field (zyxf)", latticeModel, Vector3< real_t >(real_t(0)), real_t(1), uint_t(1), field::zyxf); + blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >(real_t(0)), real_t(1), uint_t(1), field::fzyx); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); // add body field - BlockDataID bodyFieldID = field::addToStorage< BodyField_T >(blocks, "body field", nullptr, field::zyxf); + BlockDataID bodyFieldID = field::addToStorage< BodyField_T >(blocks, "body field", nullptr, field::fzyx); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( @@ -600,44 +600,44 @@ int main(int argc, char** argv) // field to store fluid velolcity BlockDataID velocityFieldID = - field::addToStorage< Vec3Field_T >(blocks, "velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers); + field::addToStorage< Vec3Field_T >(blocks, "velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers); 
BlockDataID oldVelocityFieldID = field::addToStorage< Vec3Field_T >( - blocks, "old velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers); + blocks, "old velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers); BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( - blocks, "swapped old velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers); + blocks, "swapped old velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers); // field to store curl of fluid velocity BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( - blocks, "velocity curl field", Vector3< real_t >(real_c(0)), field::zyxf, FieldGhostLayers); + blocks, "velocity curl field", Vector3< real_t >(real_c(0)), field::fzyx, FieldGhostLayers); // field to store velocity gradient BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( - blocks, "velocity gradient field", Matrix3< real_t >(real_c(0)), field::zyxf, FieldGhostLayers); + blocks, "velocity gradient field", Matrix3< real_t >(real_c(0)), field::fzyx, FieldGhostLayers); // field to store time derivative of fluid velocity BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( - blocks, "time derivative velocity field", Vector3< real_t >(real_c(0)), field::zyxf, FieldGhostLayers); + blocks, "time derivative velocity field", Vector3< real_t >(real_c(0)), field::fzyx, FieldGhostLayers); // create solid volume fraction field BlockDataID svfFieldID = - field::addToStorage< ScalarField_T >(blocks, "svf field", real_t(0), field::zyxf, FieldGhostLayers); + field::addToStorage< ScalarField_T >(blocks, "svf field", real_t(0), field::fzyx, FieldGhostLayers); // create pressure field BlockDataID pressureFieldID = - field::addToStorage< ScalarField_T >(blocks, "pressure field", real_t(0), field::zyxf, FieldGhostLayers); + field::addToStorage< ScalarField_T >(blocks, "pressure field", real_t(0), field::fzyx, FieldGhostLayers); 
// field to store pressure gradient BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( - blocks, "pressure gradient field", Vector3< real_t >(real_c(0)), field::zyxf, FieldGhostLayers); + blocks, "pressure gradient field", Vector3< real_t >(real_c(0)), field::fzyx, FieldGhostLayers); BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( - blocks, "drag force field", Vector3< real_t >(real_t(0)), field::zyxf, FieldGhostLayers); + blocks, "drag force field", Vector3< real_t >(real_t(0)), field::fzyx, FieldGhostLayers); BlockDataID amForceFieldID = field::addToStorage< Vec3Field_T >( - blocks, "am force field", Vector3< real_t >(real_t(0)), field::zyxf, FieldGhostLayers); + blocks, "am force field", Vector3< real_t >(real_t(0)), field::fzyx, FieldGhostLayers); BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( - blocks, "lift force field", Vector3< real_t >(real_t(0)), field::zyxf, FieldGhostLayers); + blocks, "lift force field", Vector3< real_t >(real_t(0)), field::fzyx, FieldGhostLayers); // map planes into the LBM simulation -> act as no-slip boundaries pe_coupling::mapBodies< BoundaryHandling_T >(*blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, diff --git a/apps/showcases/FluidizedBed/FluidizedBedMEM.cpp b/apps/showcases/FluidizedBed/FluidizedBedMEM.cpp index d9d3bc20d9fca057136e56a836087ed44aca96e1..2eb0e481e35f5fe2f25967aba47952d0d1705062 100644 --- a/apps/showcases/FluidizedBed/FluidizedBedMEM.cpp +++ b/apps/showcases/FluidizedBed/FluidizedBedMEM.cpp @@ -571,14 +571,14 @@ int main(int argc, char** argv) // add PDF field BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >(blocks, "pdf field", latticeModel, inflowVec, - densityFluid, uint_t(1), field::zyxf); + densityFluid, uint_t(1), field::fzyx); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); // add particle field BlockDataID particleFieldID = field::addToStorage< 
lbm_mesapd_coupling::ParticleField_T >( - blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers); + blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers); // add boundary handling using BoundaryHandling_T = MyBoundaryHandling< ParticleAccessor_T >::Type; diff --git a/apps/showcases/LightRisingParticleInFluidAMR/LightRisingParticleInFluidAMR.cpp b/apps/showcases/LightRisingParticleInFluidAMR/LightRisingParticleInFluidAMR.cpp index 07df97e31851603d56b89da263f29570dfe91df4..86d1978670867aa33d8011aff1a4b46dd27660a8 100644 --- a/apps/showcases/LightRisingParticleInFluidAMR/LightRisingParticleInFluidAMR.cpp +++ b/apps/showcases/LightRisingParticleInFluidAMR/LightRisingParticleInFluidAMR.cpp @@ -896,15 +896,15 @@ int main(int argc, char** argv) { shared_ptr< lbm::internal::PdfFieldHandling< LatticeModel_T > > dataHandling = make_shared< lbm::internal::PdfFieldHandling< LatticeModel_T > >(blocks, latticeModel, false, Vector3<real_t>(real_t(0)), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); pdfFieldID = blocks->loadBlockData( readCheckPointFileName+"_lbm.txt", dataHandling, "pdf field" ); } else { // add PDF field - pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >(blocks, "pdf field (zyxf)", latticeModel, + pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >(blocks, "pdf field (fzyx)", latticeModel, Vector3<real_t>(real_t(0)), real_t(1), - FieldGhostLayers, field::zyxf); + FieldGhostLayers, field::fzyx); } // add flag field @@ -916,11 +916,11 @@ int main(int argc, char** argv) { // add particle field BlockDataID particleFieldID = field::addToStorage<ParticleField_T>(blocks, "particle field", accessor->getInvalidUid(), - field::zyxf, + field::fzyx, FieldGhostLayers); // add velocity field and utility - BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::zyxf, uint_t(2) ); + BlockDataID 
velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::fzyx, uint_t(2) ); typedef lbm::VelocityFieldWriter< PdfField_T, VelocityField_T > VelocityFieldWriter_T; BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldID, velocityFieldID ) ); @@ -929,7 +929,7 @@ int main(int argc, char** argv) { velocityCommunicationScheme->addPackInfo( make_shared< field::refinement::PackInfo<VelocityField_T, stencil::D3Q27> >( velocityFieldID ) ); // add q criterion field (only needed for mesh output) - BlockDataID qCriterionFieldID = field::addToStorage<QCriterionField_T>(blocks, "q criterion field", real_t(0), field::zyxf, uint_t(1)); + BlockDataID qCriterionFieldID = field::addToStorage<QCriterionField_T>(blocks, "q criterion field", real_t(0), field::fzyx, uint_t(1)); typedef lbm::QCriterionFieldWriter<VelocityField_T, QCriterionField_T, FluidFilter_T> QCriterionFieldWriter_T; BlockSweepWrapper<QCriterionFieldWriter_T> qCriterionFieldWriter(blocks, QCriterionFieldWriter_T(blocks, velocityFieldID, diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt index 9116f0b19b73f55ecca6994c4668d0bf85fe15e3..61e4464d18c4ea1a5ee056f26792c60f6af71250 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt @@ -4,18 +4,18 @@ waLBerla_link_files_to_builddir(*.obj) waLBerla_generate_target_from_python(NAME PhaseFieldCodeGenGPU FILE multiphase_codegen.py - OUT_FILES initialize_phase_field_distributions.cu initialize_phase_field_distributions.h - initialize_velocity_based_distributions.cu initialize_velocity_based_distributions.h - phase_field_LB_step.cu phase_field_LB_step.h - phase_field_LB_NoSlip.cu phase_field_LB_NoSlip.h - hydro_LB_step.cu hydro_LB_step.h - hydro_LB_NoSlip.cu hydro_LB_NoSlip.h - PackInfo_phase_field_distributions.cu 
PackInfo_phase_field_distributions.h - PackInfo_phase_field.cu PackInfo_phase_field.h - PackInfo_velocity_based_distributions.cu PackInfo_velocity_based_distributions.h - ContactAngle.cu ContactAngle.h + OUT_FILES initialize_phase_field_distributions.${CODEGEN_FILE_SUFFIX} initialize_phase_field_distributions.h + initialize_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} initialize_velocity_based_distributions.h + phase_field_LB_step.${CODEGEN_FILE_SUFFIX} phase_field_LB_step.h + phase_field_LB_NoSlip.${CODEGEN_FILE_SUFFIX} phase_field_LB_NoSlip.h + hydro_LB_step.${CODEGEN_FILE_SUFFIX} hydro_LB_step.h + hydro_LB_NoSlip.${CODEGEN_FILE_SUFFIX} hydro_LB_NoSlip.h + PackInfo_phase_field_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_phase_field_distributions.h + PackInfo_phase_field.${CODEGEN_FILE_SUFFIX} PackInfo_phase_field.h + PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h + ContactAngle.${CODEGEN_FILE_SUFFIX} ContactAngle.h GenDefines.h) waLBerla_add_executable(NAME multiphaseGPU FILES multiphase.cpp PythonExports.cpp InitializerFunctions.cpp util.cpp multiphase_codegen.py - DEPENDS blockforest core cuda field postprocessing python_coupling lbm geometry timeloop PhaseFieldCodeGenGPU) + DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop PhaseFieldCodeGenGPU) diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp index 8d5f3c49869c289c0e93130d5dfc07b6e9158f0f..2800b98cb65008ef5d66aa98853fb5589087c8d5 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp @@ -25,11 +25,11 @@ #include "core/math/Constants.h" #include "core/timing/RemainingTimeLogger.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/DeviceSelectMPI.h" -#include "cuda/NVTX.h" -#include "cuda/ParallelStreams.h" -#include "cuda/communication/UniformGPUScheme.h" 
+#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/DeviceSelectMPI.h" +#include "gpu/NVTX.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/UniformGPUScheme.h" #include "field/AddToStorage.h" #include "field/FlagField.h" @@ -67,13 +67,13 @@ using namespace walberla; using FlagField_T = FlagField< uint8_t >; -typedef cuda::GPUField< real_t > GPUField; -typedef cuda::GPUField< uint8_t > GPUField_int; +typedef gpu::GPUField< real_t > GPUField; +typedef gpu::GPUField< uint8_t > GPUField_int; int main(int argc, char** argv) { mpi::Environment Env(argc, argv); - cuda::selectDeviceBasedOnMpiRank(); + gpu::selectDeviceBasedOnMpiRank(); exportDataStructuresToPython(); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) @@ -114,17 +114,17 @@ int main(int argc, char** argv) BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); BlockDataID vel_field_gpu = - cuda::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); + gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = - cuda::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); + gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, 
"phase field on GPU", true); // Flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); - BlockDataID flagFieldID_gpu = cuda::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); + BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); auto physical_parameters = config->getOneBlock("PhysicalParameters"); const real_t density_liquid = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0)); @@ -195,11 +195,11 @@ int main(int argc, char** argv) ////////////////////// int streamLowPriority = 0; int streamHighPriority = 0; - auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority); + auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); + auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority); auto UniformGPUSchemeVelocityDistributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu); UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions); @@ -211,7 +211,7 @@ int main(int argc, char** argv) std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(defaultStream); }); auto UniformGPUSchemePhaseField = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu); 
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field); auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); }); @@ -220,7 +220,7 @@ int main(int argc, char** argv) auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); }); auto UniformGPUSchemePhaseFieldDistributions = - make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); + make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu); UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions); @@ -255,7 +255,7 @@ int main(int argc, char** argv) } geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } - cuda::fieldCpy< GPUField_int, FlagField_T >(blocks, flagFieldID_gpu, flagFieldID); + gpu::fieldCpy< GPUField_int, FlagField_T >(blocks, flagFieldID_gpu, flagFieldID); lbm::phase_field_LB_NoSlip phase_field_LB_NoSlip(blocks, lb_phase_field_gpu); lbm::hydro_LB_NoSlip hydro_LB_NoSlip(blocks, lb_velocity_field_gpu); @@ -293,8 +293,8 @@ int main(int argc, char** argv) smear_interface(); } } - cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + gpu::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field); + WALBERLA_GPU_CHECK(cudaPeekAtLastError()) WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs") for (auto& block : *blocks) @@ -314,9 +314,9 @@ int main(int argc, char** argv) [&]() { if (timeloop.getCurrentTimeStep() % dbWriteFrequency == 0) { - cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); - cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); - 
WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); + gpu::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); + WALBERLA_GPU_CHECK(cudaPeekAtLastError()) if (scenario == 4) { @@ -411,17 +411,17 @@ int main(int argc, char** argv) timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger"); - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { const std::string path = "vtk_out"; auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, path, "simulation_step", false, true, true, false, 0); vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); - cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); + gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); + gpu::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu); }); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(cudaPeekAtLastError()) auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T, float > >(phase_field, "PhaseField"); vtkOutput->addCellDataWriter(phaseWriter); @@ -435,20 +435,20 @@ int main(int argc, char** argv) timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } - lbm::PerformanceEvaluation< FlagField_T > performance(blocks, flagFieldID, fluidFlagUID); + lbm::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID); WcTimingPool timeloopTiming; WcTimer simTimer; WALBERLA_MPI_WORLD_BARRIER() cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(cudaPeekAtLastError()) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << 
timesteps << " time steps") simTimer.start(); timeloop.run(timeloopTiming); cudaDeviceSynchronize(); - WALBERLA_CUDA_CHECK(cudaPeekAtLastError()) + WALBERLA_GPU_CHECK(cudaPeekAtLastError()) simTimer.end(); auto time = real_c(simTimer.max()); diff --git a/apps/showcases/PorousMedia/PorousMedia.cpp b/apps/showcases/PorousMedia/PorousMedia.cpp index bdd186f6e87da0f3377ef58a7e04b864af20b11e..eff881fcd7dae02f6bf0efd66aa07141fe22817d 100644 --- a/apps/showcases/PorousMedia/PorousMedia.cpp +++ b/apps/showcases/PorousMedia/PorousMedia.cpp @@ -1371,7 +1371,7 @@ int main(int argc, char** argv) // add field for particle creation (bodies) BlockDataID bodyFieldID = - field::addToStorage< BodyField_T >(blocks, "body field", nullptr, field::zyxf, FieldGhostLayers, false); + field::addToStorage< BodyField_T >(blocks, "body field", nullptr, field::fzyx, FieldGhostLayers, false); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/apps/tools/CMakeLists.txt b/apps/tools/CMakeLists.txt index 3d4e98e5c818c1baebcc4bd1399c44ad9b10779e..eaf667f372496c46e6617c16593844ee58a5eba1 100644 --- a/apps/tools/CMakeLists.txt +++ b/apps/tools/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory( MixedPrecision ) add_subdirectory( povrayFileCompressor ) \ No newline at end of file diff --git a/apps/tools/MixedPrecision/CMakeLists.txt b/apps/tools/MixedPrecision/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f14c7e5a2b3ea377db65597d4bfe37b3f38055d --- /dev/null +++ b/apps/tools/MixedPrecision/CMakeLists.txt @@ -0,0 +1,3 @@ +waLBerla_add_executable ( NAME CheckFP16 + FILES CheckFP16.cpp + DEPENDS core ) \ No newline at end of file diff --git a/apps/tools/MixedPrecision/CheckFP16.cpp b/apps/tools/MixedPrecision/CheckFP16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c436a81f2e6a4e1286ec8424c116270e169ab661 --- /dev/null +++ b/apps/tools/MixedPrecision/CheckFP16.cpp @@ -0,0 
+1,197 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CheckFP16.cpp +//! \brief Checks the availability of float16 (half precision) and verifies some properties. +//! \author Nils Kohl <nils.kohl@fau.de> +// +//====================================================================================================================== + +#include <core/DataTypes.h> +#include <core/Environment.h> +#include <core/logging/Logging.h> +#include <core/perf_analysis/extern/likwid.h> + +namespace walberla +{ + +template< typename T > +void kernel(T* v, T* vv, T* r, size_t vsize) +{ + for (size_t i = 0; i < vsize; i++) + { + r[i] = v[i] + vv[i]; + } +} + +int main(int argc, char** argv) +{ + Environment const env(argc, argv); + + WALBERLA_LOG_INFO_ON_ROOT("-------------") + WALBERLA_LOG_INFO_ON_ROOT(" FP16 checks ") + WALBERLA_LOG_INFO_ON_ROOT("-------------") + +#ifndef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + WALBERLA_LOG_INFO_ON_ROOT(" - Apparently you have not enabled half precision support.") + WALBERLA_LOG_INFO_ON_ROOT(" Reconfigure by setting the respective CMake variable to ON.") + WALBERLA_LOG_INFO_ON_ROOT(" At the time of writing this it's called WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT.") 
+ + return EXIT_FAILURE; +#else + WALBERLA_LOG_INFO_ON_ROOT(" - Half precision support enabled via CMake!") + + WALBERLA_LOG_INFO_ON_ROOT(" - Sizeof checks: ") + const auto sfloat64 = sizeof(float64); + const auto sfloat32 = sizeof(float32); + const auto sfloat16 = sizeof(float16); + WALBERLA_LOG_INFO_ON_ROOT(" + sizeof( float64 ) == " << sfloat64) + WALBERLA_LOG_INFO_ON_ROOT(" + sizeof( float32 ) == " << sfloat32) + WALBERLA_LOG_INFO_ON_ROOT(" + sizeof( float16 ) == " << sfloat16) + if (sfloat64 != 8 || sfloat32 != 4 || sfloat16 != 2) + { + WALBERLA_LOG_INFO_ON_ROOT(" Your types don't seem to have the expected sizes.") + return EXIT_FAILURE; + } + WALBERLA_LOG_INFO_ON_ROOT(" -> works out!") + + WALBERLA_LOG_INFO_ON_ROOT(" - Casting checks (promotion is required to format strings): ") + const float64 a64 = 42; + const float32 a32 = 42; + const float16 a16 = 42; + WALBERLA_LOG_INFO_ON_ROOT(" + float64: " << a64) + WALBERLA_LOG_INFO_ON_ROOT(" + float32: " << a32) + WALBERLA_LOG_INFO_ON_ROOT(" + float16: " << (double) a16) + WALBERLA_LOG_INFO_ON_ROOT(" Casting and output compiles.") + + WALBERLA_LOG_INFO_ON_ROOT(" - Basic arithmetic check: ") + const auto x = float16(1.2); + const auto y = float16(-1.8); + const float64 z = -0.6; + const float16 sum = x + y; + WALBERLA_LOG_INFO_ON_ROOT(" " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << "") + WALBERLA_CHECK(std::abs((float64) sum - z) < 1e-3, "Float16 arithmetic is broken."); + WALBERLA_LOG_INFO_ON_ROOT("") + +# ifdef WALBERLA_BUILD_WITH_LIKWID_MARKERS + WALBERLA_LOG_INFO_ON_ROOT(" - Memory traffic test. You have built with likwid enabled. 
Make sure to run ") + WALBERLA_LOG_INFO_ON_ROOT(" $ likwid-perfctr -g MEM_DP -m ./CheckFP16") + WALBERLA_LOG_INFO_ON_ROOT(" to compare the memory traffic, and") + WALBERLA_LOG_INFO_ON_ROOT(" $ likwid-perfctr -g FLOPS_AVX -m ./CheckFP16") + WALBERLA_LOG_INFO_ON_ROOT( + " for the stream-triad-like benchmark to check whether automatic float32 vectorization works.") + WALBERLA_LOG_INFO_ON_ROOT("") + WALBERLA_LOG_INFO_ON_ROOT(" The only real benefit of using float16 is reduced memory traffic since internally,\n" + "all arithmetic operations are preceded by promotions to float32 (likely - depends on " + "the machine).") + WALBERLA_LOG_INFO_ON_ROOT(" + Stream test ... ") + + LIKWID_MARKER_INIT; + LIKWID_MARKER_THREADINIT; + + LIKWID_MARKER_REGISTER("float64-mem"); + LIKWID_MARKER_REGISTER("float32-mem"); + LIKWID_MARKER_REGISTER("float16-mem"); + + LIKWID_MARKER_REGISTER("float64-vec"); + LIKWID_MARKER_REGISTER("float32-vec"); + LIKWID_MARKER_REGISTER("float16-vec"); + + size_t vsize = 100000000; + + std::vector< float64 > v64(vsize, 0.01); + std::vector< float32 > v32(vsize, 0.01f); + std::vector< float16 > v16(vsize, float16(0.01)); + + std::vector< float64 > vv64(vsize, 0.02); + std::vector< float32 > vv32(vsize, 0.02f); + std::vector< float16 > vv16(vsize, float16(0.02)); + + std::vector< float64 > r64(vsize); + std::vector< float32 > r32(vsize); + std::vector< float16 > r16(vsize); + + LIKWID_MARKER_START("float64-mem"); + float64 sum64 = 0; + for (size_t j = 0; j < vsize; j++) + { + if (0 == j % 2) { sum64 += v64[j]; } + else { sum64 -= v64[j]; } + } + WALBERLA_LOG_INFO_ON_ROOT( + " + Printing sum of float64 vector entries. 
Should be zero up to rounding errors: " << sum64); + LIKWID_MARKER_STOP("float64-mem"); + + // Start measurements + LIKWID_MARKER_START("float32-mem"); + float32 sum32 = 0; + for (size_t j = 0; j < vsize; j++) + { + if (0 == j % 2) { sum32 += v32[j]; } + else { sum32 -= v32[j]; } + } + WALBERLA_LOG_INFO_ON_ROOT( + " + Printing sum of float32 vector entries. Should be zero up to rounding errors: " << sum32); + LIKWID_MARKER_STOP("float32-mem"); + + // Start measurements + LIKWID_MARKER_START("float16-mem"); + float16 sum16 = 0; + for (size_t j = 0; j < vsize; j++) + { + if (0 == j % 2) { sum16 += v16[j]; } + else { sum16 -= v16[j]; } + } + WALBERLA_LOG_INFO_ON_ROOT( + " + Printing sum of float16 vector entries. Should be zero up to rounding errors: " << (double) sum16); + LIKWID_MARKER_STOP("float16-mem"); + + WALBERLA_LOG_INFO_ON_ROOT(" + Vectorization test ... ") + + float64* v64_ptr = v64.data(); + float64* vv64_ptr = vv64.data(); + float64* r64_ptr = r64.data(); + LIKWID_MARKER_START("float64-vec"); + kernel(v64_ptr, vv64_ptr, r64_ptr, vsize); + WALBERLA_LOG_INFO_ON_ROOT(" + Printing entry of float64 vector sum: " << r64[vsize / 2]); + LIKWID_MARKER_STOP("float64-vec"); + + float32* v32_ptr = v32.data(); + float32* vv32_ptr = vv32.data(); + float32* r32_ptr = r32.data(); + LIKWID_MARKER_START("float32-vec"); + kernel(v32_ptr, vv32_ptr, r32_ptr, vsize); + WALBERLA_LOG_INFO_ON_ROOT(" + Printing entry of float32 vector sum: " << r32[vsize / 2]); + LIKWID_MARKER_STOP("float32-vec"); + + float16* v16_ptr = v16.data(); + float16* vv16_ptr = vv16.data(); + float16* r16_ptr = r16.data(); + LIKWID_MARKER_START("float16-vec"); + kernel(v16_ptr, vv16_ptr, r16_ptr, vsize); + WALBERLA_LOG_INFO_ON_ROOT(" + Printing entry of float16 vector sum: " << (double) r16[vsize / 2]); + LIKWID_MARKER_STOP("float16-vec"); + + LIKWID_MARKER_CLOSE; + +# else + WALBERLA_LOG_INFO_ON_ROOT(" - Build and run with likwid to run memory traffic test.") +# endif +#endif + return EXIT_SUCCESS; +} +} 
// namespace walberla + +int main(int argc, char** argv) { return walberla::main(argc, argv); } diff --git a/apps/tutorials/CMakeLists.txt b/apps/tutorials/CMakeLists.txt index 4eb4eb533bf24491a8ea216ac08a5f05e17ec436..fbb863629caddc3622acbef3da29d5a21162947e 100644 --- a/apps/tutorials/CMakeLists.txt +++ b/apps/tutorials/CMakeLists.txt @@ -3,8 +3,8 @@ add_subdirectory(lbm) add_subdirectory(mesa_pd) add_subdirectory(pde) add_subdirectory(pe) -if( WALBERLA_BUILD_WITH_CUDA ) - add_subdirectory(cuda) +if( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + add_subdirectory(gpu) endif() if( WALBERLA_BUILD_WITH_CODEGEN ) add_subdirectory(codegen) diff --git a/apps/tutorials/basics/03_GameOfLife.cpp b/apps/tutorials/basics/03_GameOfLife.cpp index 868e21bcd7f8c5e62d49f984958fadb96a16221c..780f053ba382d82f428ee013264aa75fe4a87f51 100644 --- a/apps/tutorials/basics/03_GameOfLife.cpp +++ b/apps/tutorials/basics/03_GameOfLife.cpp @@ -190,7 +190,7 @@ int main( int argc, char ** argv ) BlockDataID fieldID = field::addToStorage<ScalarField>( blocks, // block storage "My Field", // name real_c(0), // initial value - field::zyxf, // layout (not relevant for scalar fields) + field::fzyx, // layout (not relevant for scalar fields) uint_c(1) // number of ghost layers ); diff --git a/apps/tutorials/codegen/01_CodegenHeatEquation.dox b/apps/tutorials/codegen/01_CodegenHeatEquation.dox index ad521e0668e40d438fb7d21723be84bdf26a733e..653ec548c58f3c5ad4aa719bab1f6c1d62430746 100644 --- a/apps/tutorials/codegen/01_CodegenHeatEquation.dox +++ b/apps/tutorials/codegen/01_CodegenHeatEquation.dox @@ -90,7 +90,7 @@ with CodeGeneration() as ctx: generate_sweep(ctx, 'HeatEquationKernel', ac) \endcode -The `CodeGeneration` context and the function `generate_sweep` are provided by waLBerla. `generate_sweep` takes the desired class name and the update rule. It then generates the kernel and builds a C++ class around it. We choose `HeatEquationKernel` as the class name. 
Through the `CodeGeneration` context, the waLBerla build system gives us access to a list of CMake variables. With `ctx.cuda` for example, we can ask if waLBerla was built with support for using NVIDIA GPUs and thus we can directly generate CUDA code with pystencils. In the scope of this first tutorial, we will not make use of this. +The `CodeGeneration` context and the function `generate_sweep` are provided by waLBerla. `generate_sweep` takes the desired class name and the update rule. It then generates the kernel and builds a C++ class around it. We choose `HeatEquationKernel` as the class name. Through the `CodeGeneration` context, the waLBerla build system gives us access to a list of CMake variables. With `ctx.gpu` for example, we can ask if waLBerla was built with support for using GPUs (either by using CUDA for NVIDIA GPUs or HIP for AMD GPUs) and thus we can directly generate device code with pystencils. In the scope of this first tutorial, we will not make use of this. The code generation script will later be called by the build system while compiling the application. 
The complete script looks like this: diff --git a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp index 2a83abba2c9c05726d79ed96caa8b6bfb41b8d9c..c7b9c902488989f8dfca6a9b5078619b6851dc34 100644 --- a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp +++ b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp @@ -114,16 +114,16 @@ int main(int argc, char** argv) const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.8)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds /////////////////// /// Field Setup /// /////////////////// - LatticeModel_T latticeModel = LatticeModel_T(omega); - BlockDataID pdfFieldId = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, field::fzyx); - BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + LatticeModel_T const latticeModel = LatticeModel_T(omega); + BlockDataID const pdfFieldId = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, field::fzyx); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); //////////////////////// /// Shear Flow Setup /// @@ -131,7 +131,7 @@ int main(int argc, char** argv) auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup"); ShearFlowInit shearFlowInitFunc(blocks, shearFlowSetup); - lbm::initializer::PdfFieldInitializer< LatticeModel_T > fieldInit(pdfFieldId, blocks); + lbm::initializer::PdfFieldInitializer< LatticeModel_T > const fieldInit(pdfFieldId, blocks); fieldInit.initDensityAndVelocity(shearFlowInitFunc); ///////////////////////// diff --git 
a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp index 5b7790c7d31b6730f410a1b99460c3aa51c1cff5..1856106c5b61880752c7216ee10eabda140485b1 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp @@ -22,13 +22,10 @@ #include "core/all.h" -#if defined(WALBERLA_BUILD_WITH_CUDA) -# include "cuda/AddGPUFieldToStorage.h" -# include "cuda/DeviceSelectMPI.h" -# include "cuda/HostFieldAllocator.h" -# include "cuda/ParallelStreams.h" -# include "cuda/communication/GPUPackInfo.h" -# include "cuda/communication/UniformGPUScheme.h" +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) +# include "gpu/AddGPUFieldToStorage.h" +# include "gpu/ParallelStreams.h" +# include "gpu/communication/UniformGPUScheme.h" #endif #include "domain_decomposition/all.h" @@ -71,8 +68,8 @@ typedef walberla::uint8_t flag_t; typedef FlagField< flag_t > FlagField_T; typedef lbm::CumulantMRTNoSlip NoSlip_T; -#if defined(WALBERLA_BUILD_WITH_CUDA) -typedef cuda::GPUField< real_t > GPUField; +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) +typedef gpu::GPUField< real_t > GPUField; #endif ////////////////////////////////////////// @@ -84,8 +81,8 @@ void initShearFlowVelocityField(const shared_ptr< StructuredBlockForest >& block { math::RealRandom< real_t > rng(config.getParameter< std::mt19937::result_type >("noiseSeed", 42)); - real_t velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08)); - real_t noiseMagnitude = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude); + real_t const velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08)); + real_t const noiseMagnitude = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude); auto n_y = real_c(blocks->getNumberOfYCells()); @@ -128,8 +125,8 @@ int main(int argc, char** argv) const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", 
uint_c(10)); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.8)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds const uint_t VTKwriteFrequency = parameters.getParameter< uint_t >("VTKwriteFrequency", 1000); //////////////////////////////////// @@ -138,16 +135,16 @@ int main(int argc, char** argv) // Common Fields BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx); - BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); -#if defined(WALBERLA_BUILD_WITH_CUDA) +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) // GPU Field for PDFs - BlockDataID pdfFieldId = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + BlockDataID const pdfFieldId = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t(1)); // GPU Velocity Field BlockDataID velocityFieldIdGPU = - cuda::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true); + gpu::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true); #else // CPU Field for PDFs BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx); @@ -157,11 +154,11 @@ int main(int argc, char** argv) auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup"); initShearFlowVelocityField(blocks, velocityFieldId, shearFlowSetup); - real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0)); + real_t const rho = shearFlowSetup.getParameter("rho", real_c(1.0)); // pdfs setup -#if 
defined(WALBERLA_BUILD_WITH_CUDA) - cuda::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId); +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + gpu::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId); pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho); #else pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho); @@ -176,10 +173,10 @@ int main(int argc, char** argv) /// Sweep /// ///////////// -#if defined(WALBERLA_BUILD_WITH_CUDA) - pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega); +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + pystencils::CumulantMRTSweep const CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega); #else - pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); + pystencils::CumulantMRTSweep const CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); #endif ///////////////////////// @@ -204,8 +201,9 @@ int main(int argc, char** argv) SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); // Communication -#if defined(WALBERLA_BUILD_WITH_CUDA) - cuda::communication::UniformGPUScheme< Stencil_T > com(blocks, 0); +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + const bool sendDirectlyFromGPU = false; + gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU); com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); auto communication = std::function< void() >([&]() { com.communicate(nullptr); }); #else @@ -227,10 +225,10 @@ int main(int argc, char** argv) auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0, false, path, "simulation_step", false, true, true, false, 0); -#if defined(WALBERLA_BUILD_WITH_CUDA) +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) // Copy velocity data to CPU before output vtkOutput->addBeforeFunction( - [&]() { cuda::fieldCpy< VectorField_T, GPUField >(blocks, 
velocityFieldId, velocityFieldIdGPU); }); + [&]() { gpu::fieldCpy< VectorField_T, GPUField >(blocks, velocityFieldId, velocityFieldIdGPU); }); #endif auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity"); diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox b/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox index f3882b26c5f1cebe8a426661d5ddbd61b9bc1a15..0e3ace4036f4d7003738532adf3f225b0e8f39c4 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox @@ -7,7 +7,7 @@ namespace walberla{ This tutorial demonstrates how to use [pystencils](https://pycodegen.pages.i10git.cs.fau.de/pystencils) and [lbmpy](https://pycodegen.pages.i10git.cs.fau.de/lbmpy) to generate highly optimised and hardware-specific Lattice Boltzmann simulation code within the waLBerla framework. Other than in \ref tutorial_codegen02, we will be generating a full LBM sweep instead of a lattice model class. Furthermore, we will generate a communication pack info class and a sweep to initialise the PDF field. A hardware-specific implementation of a NoSlip boundary handler will also be generated. Those components will then be combined in a waLBerla application for simulating the same shear flow scenario as in the previous tutorial. -For large-scale LB simulations, the highly parallel design of a general-purpose graphics processing unit (GPGPU) can yield significant improvements in performance. The waLBerla framework relies on CUDA to run simulations on NVIDIA GPUs. In this tutorial, we will also show how code generation can be used to generate native CUDA code for different kinds of kernels. +For large-scale LB simulations, the highly parallel design of a general-purpose graphics processing unit (GPGPU) can yield significant improvements in performance. The waLBerla framework relies on CUDA/HIP to run simulations on NVIDIA or AMD GPUs. 
In this tutorial, we will also show how code generation can be used to generate native CUDA/HIP code for different kinds of kernels. In this tutorial, we will be using the more advanced cumulant-based multiple-relaxation-time (MRT) collision operator. Instead of relaxing the entire distribution functions toward their equilibrium values, their [cumulants](https://en.wikipedia.org/wiki/Cumulant) are relaxed with individual relaxation rates. We will also use the D2Q9 velocity set. For this velocity set, the zeroth- and first-order cumulants correspond to density and momentum which are conserved during collisions, so their relaxation rates can be set to zero. We will specify one common relaxation rate \f$ \omega \f$ for the three second-order cumulants to ensure the correct viscosity of the fluid; the higher-order cumulants will be set to their equilibrium values which correspond to a relaxation rate of 1. @@ -64,7 +64,7 @@ pdfs_setter = macroscopic_values_setter(lbm_method, pdfs.center_vector) \endcode -Everything is now prepared to generate the actual C++ code. We create the code generation context and evaluate the `ctx.cuda` flag to find out if waLBerla is configured to build GPU code. If CUDA is enabled, we set the `target` to `gpu`; otherwise to `cpu`. The target is then passed to all code generation functions. If GPU code is to be generated, the generated classes will be implemented in `*.cu` files, and their sweeps will run on the GPU. +Everything is now prepared to generate the actual C++ code. We create the code generation context and evaluate the `ctx.gpu` flag to find out if waLBerla is configured to build GPU code. If CUDA/HIP is enabled, we set the `target` to `gpu`; otherwise to `cpu`. The target is then passed to all code generation functions. If GPU code is to be generated, the generated classes will be implemented in `*.cu` files for CUDA device code or `*.cpp` for HIP device code, and their sweeps will run on the GPU. 
Several functions from `pystencils_walberla` and `lbmpy_walberla` are called to generate the classes: @@ -75,10 +75,7 @@ Several functions from `pystencils_walberla` and `lbmpy_walberla` are called to \code{.py} with CodeGeneration() as ctx: - if ctx.cuda: - target = ps.Target.GPU - else: - target = ps.Target.CPU + target = ps.Target.GPU if ctx.gpu else ps.Target.CPU # LBM Sweep generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target) @@ -93,7 +90,7 @@ with CodeGeneration() as ctx: generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method, target=target) \endcode -As in \ref tutorial_codegen02, the classes generated by the above code need to be registered with CMake using the `walberla_generate_target_from_python` macro. Since the source file extension is different if CUDA code is generated (`*.cu` instead of `*.cpp`), the code generation target needs to be added twice. During the build process, the correct target is selected through the surrounding `if(WALBERLA_BUILD_WITH_CUDA)` block. Furthermore, the application depends on `cuda`, which is used from the waLBerla backend. +As in \ref tutorial_codegen02, the classes generated by the above code need to be registered with CMake using the `walberla_generate_target_from_python` macro. Since the source file extension for device code can be different, we use the macro `CODEGEN_FILE_SUFFIX`. This macro essentially switches to `*.cu` only if `CUDA` is used. During the build process, the correct target is selected through the surrounding `if(WALBERLA_BUILD_WITH_GPU_SUPPORT)` block, which makes the application depend on `gpu`. This refers to the `gpu` files in waLBerla. \section advancedlbmcodegen_application The waLBerla application @@ -226,9 +223,9 @@ After the velocity field has been initialised, the generated `InitialPDFsSetter` The simulation is now ready to run.
-\subsection advancedlbmpy_cuda Differences in the GPU application +\subsection advancedlbmpy_gpu Differences in the GPU application -If CUDA is enabled, some implementation details need to be different from a CPU-only version. This mainly concerns the creation and management of fields, MPI communication and VTK output. Since the initialisation, LBM and NoSlip sweeps run entirely on the GPU, the PDF field has to be set up only in graphics memory. In contrast to that is the velocity field required by CPU and GPU. The shear flow velocity profile is constructed by CPU code before the initialisation kernel maps it onto the PDF field on the GPU. Also, the VTK output routines which run on the CPU need to read the velocity field. It thus needs to be created twice: Once in the main memory, and once in GPU memory. It is then copied on-demand from the GPU to the CPU. Furthermore, we create a flag field, which is only needed on the CPU. After the initialisation, we use it to create the index-vectors for the boundary-handling. The index vectors are then transferred to the GPU and not the entire flag field. +If `GPU_SUPPORT` is enabled, some implementation details need to be different from a CPU-only version. This mainly concerns the creation and management of fields, MPI communication and VTK output. Since the initialisation, LBM and NoSlip sweeps run entirely on the GPU, the PDF field has to be set up only in graphics memory. In contrast to that is the velocity field required by CPU and GPU. The shear flow velocity profile is constructed by CPU code before the initialisation kernel maps it onto the PDF field on the GPU. Also, the VTK output routines which run on the CPU need to read the velocity field. It thus needs to be created twice: Once in the main memory, and once in GPU memory. It is then copied on-demand from the GPU to the CPU. Furthermore, we create a flag field, which is only needed on the CPU. 
After the initialisation, we use it to create the index-vectors for the boundary-handling. The index vectors are then transferred to the GPU and not the entire flag field. For the largest part, though, the C++ code is identical. The code snippets presented above represent only the CPU variant of the code. The GPU implementation can be found in the source file 03_AdvancedLBMCodegen.cpp. There, code blocks which are different from the CPU to the GPU implementation are toggled via preprocessor conditionals. diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py index b139c99998526e0717f6ed3a7fb249e7d504a30d..a1f5f9874e6e3589193ff719c87f3292fbed02b4 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py @@ -60,10 +60,7 @@ with CodeGeneration() as ctx: velocity.center_vector, pdfs.center_vector) - if ctx.cuda: - target = ps.Target.GPU - else: - target = ps.Target.CPU + target = ps.Target.GPU if ctx.gpu else ps.Target.CPU # LBM Sweep generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target) diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt index 339f648197b4d716c9357f19b5f03ad4fdb43ffd..4b50efaa33346262db46751eced9b993a4eab78f 100644 --- a/apps/tutorials/codegen/CMakeLists.txt +++ b/apps/tutorials/codegen/CMakeLists.txt @@ -24,25 +24,18 @@ if( WALBERLA_BUILD_WITH_CODEGEN ) DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython ) # Tutorial 3: Advanced lbmpy Code Generation - if(WALBERLA_BUILD_WITH_CUDA) - walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython - FILE 03_AdvancedLBMCodegen.py - OUT_FILES CumulantMRTSweep.cu CumulantMRTSweep.h - CumulantMRTPackInfo.cu CumulantMRTPackInfo.h - InitialPDFsSetter.cu InitialPDFsSetter.h - CumulantMRTNoSlip.cu CumulantMRTNoSlip.h) - + 
walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython + FILE 03_AdvancedLBMCodegen.py + OUT_FILES CumulantMRTSweep.${CODEGEN_FILE_SUFFIX} CumulantMRTSweep.h + CumulantMRTPackInfo.${CODEGEN_FILE_SUFFIX} CumulantMRTPackInfo.h + InitialPDFsSetter.${CODEGEN_FILE_SUFFIX} InitialPDFsSetter.h + CumulantMRTNoSlip.${CODEGEN_FILE_SUFFIX} CumulantMRTNoSlip.h) + + if(WALBERLA_BUILD_WITH_GPU_SUPPORT) walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp - FILES 03_AdvancedLBMCodegen.cpp - DEPENDS blockforest cuda core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) + FILES 03_AdvancedLBMCodegen.cpp + DEPENDS blockforest gpu core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) else() - walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython - FILE 03_AdvancedLBMCodegen.py - OUT_FILES CumulantMRTSweep.cpp CumulantMRTSweep.h - CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h - InitialPDFsSetter.cpp InitialPDFsSetter.h - CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h) - walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp FILES 03_AdvancedLBMCodegen.cpp DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp b/apps/tutorials/cuda/01_GameOfLife_cuda.cpp deleted file mode 100644 index 518acb7e16f39b0136414fd52e9b480f4ee7b11d..0000000000000000000000000000000000000000 --- a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp +++ /dev/null @@ -1,148 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. 
-// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file 03_GameOfLife.cpp -//! \author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#include "01_GameOfLife_kernels.h" -#include "cuda/HostFieldAllocator.h" -#include "blockforest/Initialization.h" -#include "blockforest/communication/UniformDirectScheme.h" -#include "blockforest/communication/UniformBufferedScheme.h" - -#include "core/Environment.h" - -#include "cuda/HostFieldAllocator.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/Kernel.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/FieldIndexing.h" - -#include "field/AddToStorage.h" -#include "field/communication/UniformMPIDatatypeInfo.h" -#include "field/vtk/VTKWriter.h" - -#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h" -#include "geometry/structured/GrayScaleImage.h" - -#include "gui/Gui.h" - -#include "stencil/D2Q9.h" - -#include "timeloop/SweepTimeloop.h" - - -using namespace walberla; - -typedef GhostLayerField<double,1> ScalarField; -typedef cuda::GPUField<double> GPUField; - - -ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - double(0), 
// initial value - field::fzyx, // layout - make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory - ); -} - -class GameOfLifeSweepCUDA -{ - public: - GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID ) - : gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID ) - { - } - void operator() ( IBlock * block ) - { - auto srcCudaField = block->getData< cuda::GPUField<double> > ( gpuFieldSrcID_ ); - auto dstCudaField = block->getData< cuda::GPUField<double> > ( gpuFieldDstID_ ); - - auto myKernel = cuda::make_kernel( &gameOfLifeKernel ); - myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) ); - myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) ); - myKernel(); - - srcCudaField->swapDataPointers( dstCudaField ); - } - private: - BlockDataID gpuFieldSrcID_; - BlockDataID gpuFieldDstID_; -}; - - -int main( int argc, char ** argv ) -{ - walberla::Environment env( argc, argv ); - - geometry::GrayScaleImage image ("GosperGliderGun.png"); - - // Create blocks - shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( - uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction - image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block (x,y,z) - real_t(1), // dx: length of one cell in physical coordinates - false, // one block per process - "false" means all blocks to one process - false, false, false ); // no periodicity - - - BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); - - // Initializing the field from an image - using geometry::initializer::ScalarFieldFromGrayScaleImage; - ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ; - fieldInitializer.init( image, uint_t(2), false ); - - BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); - 
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); - - - typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9 > CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; - // Alternative, if CUDA enabled MPI is available - //blockforest::communication::UniformDirectScheme<stencil::D2Q9 > - //typedef field::communication::UniformMPIDatatypeInfo<GPUField> Packing - - CommScheme commScheme(blocks); - commScheme.addDataToCommunicate( make_shared<Packing>(gpuFieldSrcID) ); - - // Create Timeloop - const uint_t numberOfTimesteps = uint_t(100); // number of timesteps for non-gui runs - SweepTimeloop timeloop ( blocks, numberOfTimesteps ); - - // Registering the sweep - timeloop.add() << BeforeFunction( commScheme, "Communication" ) - << Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" ); - - timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) ); - - // Register VTK output - timeloop.addFuncAfterTimeStep( field::createVTKOutput<ScalarField>( cpuFieldID, *blocks, "game_of_life" ) ); - - // GUI output - GUI gui ( timeloop, blocks, argc, argv ); - gui.run(); - - return 0; -} diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.dox b/apps/tutorials/cuda/01_GameOfLife_cuda.dox deleted file mode 100644 index 7bbb50fe412080853588d0b1e0be0c8ed1a5e9a5..0000000000000000000000000000000000000000 --- a/apps/tutorials/cuda/01_GameOfLife_cuda.dox +++ /dev/null @@ -1,139 +0,0 @@ -namespace walberla{ - -/** -\page tutorial_cuda01 Tutorial - CUDA 1: Game of Life on GPU - - -\image html tutorial_cuda01_nvidia_titan.png - -> _Note:_ This tutorial required a CUDA aware MPI library. -> If you get a SEGFAULT when executing this tutorial, make sure that your MPI library was built with -> CUDA support! For instructions how to build OpenMPI with CUDA see this [page](https://www.open-mpi.org/faq/?category=building#build-cuda). 
- -\section cuda01_fields Creating Fields - -To run a simulation on a NVIDIA graphics card, we have to allocate data on the GPU and -write a CUDA kernel that operates on this data. In this tutorial we first allocate a field on the GPU -and learn about functionality to transfer data between CPU and GPU fields. - -Since initialization and output routines are usually not time critical, they are implemented -for CPU fields only. In waLBerla we set up the complete simulation using -CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the -end, copy everything back to do the output from the CPU field. -So only the time critical kernels have to be written in CUDA. - -Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented -in a previous tutorial ( \ref tutorial_basics_03 ). -One difference is, that fields which are often transfered from/to the GPU should be allocated with -a different field allocator: cuda::HostFieldAllocator . This allocator uses cudaHostAlloc() instead of "new" , -such that the memory is marked "pinned", which means that it is always held in RAM and cannot be swapped out to disk. -Data transfer from pinned memory is faster than from normal memory. The usage of this allocator is not -mandatory, the data transfer functions work (slightly slower) also with normally allocated fields. 
- -\code -ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - real_t(0), // initial value - field::fzyx, // layout - make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory - ); -} -\endcode - -Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics03 . -Then two GPU fields are created: "source" and "destination" field. The helper function -cuda::addGPUFieldToStorage() creates a cuda::GPUField field of the same size and layout of the given -CPU field: -\code -BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); -BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); -\endcode -The contents of the new GPU fields are initialized with the contents of the given CPU field. - - - -\section cuda01_kernels Writing and calling CUDA kernels - -For a basic understanding of the CUDA support in waLBerla please read \ref cudaPage first. - -After reading this page you should know what a FieldAccessor is and how to call CUDA kernels from -cpp files. So we can now start with writing -a CUDA kernel for the Game of Life algorithm. We place this in a separate file with ".cu" extension. -The build system then automatically detects that this file should be compiled with the CUDA C++ compiler. - -The kernel gets two field accessors as arguments, one for the source and one for the destination field. -Both accessors have to be configured using the CUDA variables blockIdx and threadIdx, such that afterwards -the get() and getNeighbor() functions of the accessor class can work correctly. 
-\code -__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ) -{ - src.set( blockIdx, threadIdx ); - dst.set( blockIdx, threadIdx ); - int liveNeighbors = 0; - if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors; - if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors; - // normal Game of Life algorithm .... - // ... -} -\endcode - -To call this kernel we write a thin wrapper sweep which only has to get the GPU fields out of the blockstorage -and passes them to the CUDA kernel. We use the cuda::Kernel class from waLBerla here, so that we can write this -sweep in a normal cpp file. -Here are the contents of this sweep: -\code -auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ ); -auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ ); - -auto myKernel = cuda::make_kernel( &gameOfLifeKernel ); -myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) ); -myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) ); -myKernel(); - -srcCudaField->swapDataPointers( dstCudaField ); -\endcode - -All the computations are done on the GPU. The CPU field is not updated automatically! It was just used for -setup reasons. - -To see if our kernel works, we copy the contents back to the CPU field after every timestep: -\code -timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) ); -\endcode -Of course this makes no sense for real simulations, since the transfer time is much higher than the -time that was saved by doing the computation on the GPU. For production runs, one would usually transfer the -field back every n'th timestep and write e.g. a VTK frame. - - -\section cuda01_comm Communication - -In waLBerla there are two types of communication: _buffered_ and _direct_ communication. 
-While buffered communication first collects all data in a buffer and sends only one message per communciation step and neighbor -the direct communciation strategy, which is based on MPI datatypes, uses no intermediate buffers and therefore has to send -more messages than buffered communication. For details see \ref walberla_communication . - -In the tutorials up to now, only the buffered approach was used. In this tutorial, we switch to the direct communciation strategy -because then we can use the CUDA support of the MPI library to directly communciate from/to GPU memory. - -The usage of the two different communication schemes is very similar. Instead of creating a blockforest::communication::UniformBufferedScheme -we create a blockforest::communication::UniformDirectScheme. -Then we register a field::communication::UniformMPIDatatypeInfo instead of the field::communication::PackInfo. - -\code -typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme; -CommScheme communication( blocks ); -communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) ); -\endcode - -This scheme also supports heterogeneous simulations, i.e. using a CPU field on -some processes and a GPU field on other processes. 
- -*/ - - -} diff --git a/apps/tutorials/cuda/01_GameOfLife_kernels.h b/apps/tutorials/cuda/01_GameOfLife_kernels.h deleted file mode 100644 index 11f5eeba25800f18b9029092b510a9b36dfe1de0..0000000000000000000000000000000000000000 --- a/apps/tutorials/cuda/01_GameOfLife_kernels.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include <iostream> - -#include "cuda/FieldAccessor.h" - - -namespace walberla { - - -__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ); - - -} // namespace walberla diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.cpp b/apps/tutorials/gpu/01_GameOfLife_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2cfc8b30b94e1bad57508d23f5a672de7ccc8df5 --- /dev/null +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.cpp @@ -0,0 +1,117 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file 03_GameOfLife.cpp +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "01_GameOfLife_kernels.h" +#include "blockforest/Initialization.h" + +#include "core/Environment.h" + +#include "gpu/HostFieldAllocator.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/MemcpyPackInfo.h" +#include "gpu/communication/UniformGPUScheme.h" + +#include "field/AddToStorage.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h" +#include "geometry/structured/GrayScaleImage.h" + +#include "stencil/D2Q9.h" + +#include "timeloop/SweepTimeloop.h" + + +using namespace walberla; + +using ScalarField = GhostLayerField<real_t, 1>; +using GPUField = gpu::GPUField<real_t>; +using CommScheme = gpu::communication::UniformGPUScheme<stencil::D2Q9 > ; +using Packing = gpu::communication::MemcpyPackInfo<GPUField> ; + + +int main( int argc, char ** argv ) +{ + walberla::Environment const env( argc, argv ); + + geometry::GrayScaleImage const image ("GosperGliderGun.png"); + + // Create blocks + shared_ptr< StructuredBlockForest > const blocks = blockforest::createUniformBlockGrid ( + uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction + image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block (x,y,z) + real_t(1), // dx: length of one cell in physical coordinates + false, // one block per process - "false" means all blocks to one process + false, false, false ); // no periodicity + + + auto hostFieldAllocator = make_shared< gpu::HostFieldAllocator<real_t> >(); + BlockDataID const cpuFieldID =field::addToStorage< ScalarField >(blocks, "CPU Field", real_c(0.0), field::fzyx, uint_c(1), hostFieldAllocator); + + // Initializing the field from an image + using geometry::initializer::ScalarFieldFromGrayScaleImage; + 
ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ; + fieldInitializer.init( image, uint_t(2), false ); + + BlockDataID const gpuFieldSrcID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID const gpuFieldDstID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); + + const bool sendDirectlyFromGPU = false; + CommScheme commScheme(blocks, sendDirectlyFromGPU); + commScheme.addPackInfo( make_shared<Packing>(gpuFieldSrcID) ); + + // Create Timeloop + const uint_t numberOfTimesteps = uint_t(101); // number of timesteps for non-gui runs + SweepTimeloop timeloop ( blocks, numberOfTimesteps ); + + // Registering the sweep + timeloop.add() << BeforeFunction( commScheme.getCommunicateFunctor(), "Communication" ) + << Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" ); + + // VTK Writer every vtkWriteFrequency timesteps + const uint_t vtkWriteFrequency = 2; + if (vtkWriteFrequency > 0) + { + // Create a vtkOutput object with standard arguments + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency); + + // Before the VTK output we need to sync the GPU data to the CPU memory + vtkOutput->addBeforeFunction(gpu::fieldCpyFunctor<ScalarField, GPUField >(blocks, cpuFieldID, gpuFieldDstID)); + + // Then create a dataWriter and write the output + auto dataWriter = make_shared< field::VTKWriter< ScalarField > >(cpuFieldID, "output"); + vtkOutput->addCellDataWriter(dataWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + WcTimer simTimer; + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.start(); + timeloop.run(); + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + auto time = real_c(simTimer.last()); + WALBERLA_LOG_RESULT_ON_ROOT("Game of life tutorial finished. 
Elapsed time " << time) + + return EXIT_SUCCESS; +} diff --git a/apps/tutorials/gpu/01_GameOfLife_cuda.dox b/apps/tutorials/gpu/01_GameOfLife_cuda.dox new file mode 100644 index 0000000000000000000000000000000000000000..8794e6c520ffb31d2c3653622cb2f4b4ba4b6eda --- /dev/null +++ b/apps/tutorials/gpu/01_GameOfLife_cuda.dox @@ -0,0 +1,160 @@ +namespace walberla{ + +/** +\page tutorial_gpu01 Tutorial - GPU 1: Game of Life on GPU + +\section gpu01_overview Overview + +In this tutorial, we will implement <a target="_blank" href="http://en.wikipedia.org/wiki/Conway%27s_Game_of_Life">Conway's Game of Life</a>, +the algorithm which made cellular automata popular on graphics processing units (GPUs). This tutorial runs on NVIDIA GPUs with CUDA +but can also run on AMD GPUs using HIP. waLBerla fully supports both libraries. +For a basic understanding of the GPU support in waLBerla please read \ref gpuPage first. + +This tutorial is an extension of \ref tutorial_basics_03 to GPUs. + +\section gpu01_fields Creating Fields + +To run a simulation on a graphics processing unit (GPU), we have to allocate data on the GPU and +write a kernel that operates on this data. In this tutorial we first allocate a field on the GPU +and learn about functionality to transfer data between CPU and GPU fields. + +Since initialization and output routines are usually not time critical, they are implemented +for CPU fields only. In waLBerla we set up the complete simulation using +CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the +end, copy everything back to do the output from the CPU field. +So only the time critical kernels have to be written for GPU. + +Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented +in a previous tutorial ( \ref tutorial_basics_03 ). 
+One difference is, that fields which are often transferred from/to the GPU should be allocated with +a different field allocator: gpu::HostFieldAllocator . This allocator uses gpuHostAlloc() instead of "new" , +such that the memory is marked "pinned", which means that it is always held in RAM and cannot be swapped out to disk. +Data transfer from pinned memory is faster than from normal memory. The usage of this allocator is not +mandatory, the data transfer functions work (slightly slower) also with normally allocated fields. + +\code +auto hostFieldAllocator = make_shared< gpu::HostFieldAllocator<real_t> >(); +BlockDataID const cpuFieldID =field::addToStorage< ScalarField >(blocks, "CPU Field", real_c(0.0), field::fzyx, uint_c(1), hostFieldAllocator); +\endcode + +Now we initialize the CPU field just like in the previous tutorial \ref tutorial_basics03 . +Then two GPU fields are created: "source" and "destination" field. The helper function +gpu::addGPUFieldToStorage() creates a gpu::GPUField field of the same size and layout of the given +CPU field: +\code +BlockDataID gpuFieldSrcID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); +BlockDataID gpuFieldDstID = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); +\endcode +The contents of the new GPU fields are initialized with the contents of the given CPU field. + + + +\section cuda01_kernels Writing and calling CUDA kernels + +After reading this page you should know what a FieldAccessor is and how to call GPU. So we can now start with writing +a kernel for the Game of Life algorithm. We place this in a separate file with ".cu" extension (This is basically +the only part that is different between CUDA and HIP). +The build system then automatically detects that this file should be compiled with the CUDA C++ compiler. + +The kernel gets two field accessors as arguments, one for the source and one for the destination field. 
+Both accessors have to be configured using the variables blockIdx and threadIdx from the CUDA or HIP library, such that afterwards +the get() and getNeighbor() functions of the accessor class can work correctly. +\code +__global__ void gameOfLifeKernel( gpu::FieldAccessor<double> src, gpu::FieldAccessor<double> dst ) +{ + src.set( blockIdx, threadIdx ); + dst.set( blockIdx, threadIdx ); + int liveNeighbors = 0; + if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors; + if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors; + // normal Game of Life algorithm .... + // ... +} +\endcode + +To call this kernel we create a gpu::FieldIndexing object that receives a pointer to GPU fields. With this +the blockDim and gridDim can be obtained as well as gpuAccess objects that contain the neighbouring information needed inside the GPU kernel. +The kernel can be called normally with the three angle brackets. + +\code + auto srcCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldSrcID_ ); + auto dstCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldDstID_ ); + + auto srcIndexing = gpu::FieldIndexing<real_t>::xyz( *srcCudaField ); + auto dstIndexing = gpu::FieldIndexing<real_t>::xyz( *dstCudaField ); + + auto srcAccess = srcIndexing.gpuAccess(); + auto dstAccess = dstIndexing.gpuAccess(); + + const dim3 gridDim = srcIndexing.gridDim(); + const dim3 blockDim = srcIndexing.blockDim(); + + gameOfLifeKernel<<<gridDim, blockDim, 0, nullptr >>>(srcAccess, dstAccess ); + + srcCudaField->swapDataPointers( dstCudaField ); +\endcode + +All the computations are done on the GPU. The CPU field is not updated automatically! It was just used for +setup reasons. + +\section gpu01_vtk VTK Output + +To see if our kernel works, we create a VTK writer. The VTK writer works on the CPU field. Thus it works exactly as in other +examples. However, since our data is on GPU we need a `addBeforeFunction` that copies our data from host to device. 
This is done using the gpu::fieldCpyFunctor. +Note that copying data is costly and thus we don't want to do this in every timestep usually. In this example it is only done every second timestep. + +\code + const uint_t vtkWriteFrequency = 2; + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency); + vtkOutput->addBeforeFunction(gpu::fieldCpyFunctor<ScalarField, GPUField >(blocks, cpuFieldID, gpuFieldDstID)); + + auto dataWriter = make_shared< field::VTKWriter< ScalarField > >(cpuFieldID, "output"); + vtkOutput->addCellDataWriter(dataWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } +\endcode + +\section gpu01_comm Communication + +For this tutorial we use the gpu::communication::UniformGPUScheme that first collects all data in a buffer and +sends only one message per communication step and neighbor. For the PackInfo we use the MemcpyPackInfo. It receives +a buffer located on the GPU and fills it using memcpy operations +If the GPU library is build with MPI support this buffer can be send to other GPUs without a copy to the CPU. +Otherwise the copying will be done in the back by the communication class. + +\code + using CommScheme = gpu::communication::UniformGPUScheme<stencil::D2Q9 > ; + using Packing = gpu::communication::MemcpyPackInfo<GPUField> ; + const bool sendDirectlyFromGPU = false; + CommScheme commScheme(blocks, sendDirectlyFromGPU); + commScheme.addPackInfo( make_shared<Packing>(gpuFieldSrcID) ); +\endcode + +\section gpu01_running Running the simulation + +To run the simulation we would like to point out a few common pitfalls to avoid. Basically it works very similar than the +CPU equivalent. Since all Sweeps and Function calls are registered by the timeloop we can run the simulation using +`timeloop.run();`. However, it is important to point out that kernel calls are asynchronous. 
Thus for time measurement purpose +we need to make sure that all kernels are executed before stopping the timer. This can be done using `gpuDeviceSynchronize`. +For good measure we also run this function right before starting the timer. + +\code + WcTimer simTimer; + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.start(); + timeloop.run(); + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + auto time = real_c(simTimer.last()); + WALBERLA_LOG_RESULT_ON_ROOT("Game of life tutorial finished. Elapsed time " << time) +\endcode + +\image html GameOfLifeGPU.png + +*/ + + +} diff --git a/apps/tutorials/cuda/01_GameOfLife_kernels.cu b/apps/tutorials/gpu/01_GameOfLife_kernels.cu similarity index 52% rename from apps/tutorials/cuda/01_GameOfLife_kernels.cu rename to apps/tutorials/gpu/01_GameOfLife_kernels.cu index 399f705c82e29d3d62b6f2c2a6db7ffaee6639ff..47a54ea7c8edbccd796661141315209604f1dba0 100644 --- a/apps/tutorials/cuda/01_GameOfLife_kernels.cu +++ b/apps/tutorials/gpu/01_GameOfLife_kernels.cu @@ -1,13 +1,10 @@ -#include "../cuda/01_GameOfLife_kernels.h" - -#include <iostream> - +#include "../gpu/01_GameOfLife_kernels.h" namespace walberla { -__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst ) +__global__ void gameOfLifeKernel( gpu::FieldAccessor<real_t> src, gpu::FieldAccessor<real_t> dst ) { src.set( blockIdx, threadIdx ); dst.set( blockIdx, threadIdx ); @@ -34,6 +31,25 @@ __global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAc dst.get() = src.get(); } +void GameOfLifeSweepCUDA::operator()(IBlock * block) +{ + auto srcCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldSrcID_ ); + auto dstCudaField = block->getData< gpu::GPUField<real_t> > ( gpuFieldDstID_ ); + + auto srcIndexing = gpu::FieldIndexing<real_t>::xyz( *srcCudaField ); + auto dstIndexing = gpu::FieldIndexing<real_t>::xyz( *dstCudaField ); + + auto 
srcAccess = srcIndexing.gpuAccess(); + auto dstAccess = dstIndexing.gpuAccess(); + + const dim3 gridDim = srcIndexing.gridDim(); + const dim3 blockDim = srcIndexing.blockDim(); + + gameOfLifeKernel<<<gridDim, blockDim, 0, nullptr >>>(srcAccess, dstAccess ); + + srcCudaField->swapDataPointers( dstCudaField ); +} + diff --git a/apps/tutorials/gpu/01_GameOfLife_kernels.h b/apps/tutorials/gpu/01_GameOfLife_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..663e3ac3f14d6e4ae3730d4fa3dda5600b0f5526 --- /dev/null +++ b/apps/tutorials/gpu/01_GameOfLife_kernels.h @@ -0,0 +1,31 @@ +#pragma once +#include "core/DataTypes.h" +#include "core/logging/Logging.h" + +#include "gpu/FieldIndexing.h" + +#include "field/SwapableCompare.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +namespace walberla { + +class GameOfLifeSweepCUDA +{ + public: + GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID ) + : gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID ){} + + void operator() ( IBlock * block ); + + private: + BlockDataID gpuFieldSrcID_; + BlockDataID gpuFieldDstID_; +}; + + +__global__ void gameOfLifeKernel(gpu::FieldAccessor<real_t> src, gpu::FieldAccessor<real_t> dst ); + + +} // namespace walberla diff --git a/apps/tutorials/cuda/CMakeLists.txt b/apps/tutorials/gpu/CMakeLists.txt similarity index 67% rename from apps/tutorials/cuda/CMakeLists.txt rename to apps/tutorials/gpu/CMakeLists.txt index efa4d2a554d84d69d4606594c4c2a50d4f65cf8e..14590ec05cc7b0dc8d8d7033fd9e1c3d3ffcce30 100644 --- a/apps/tutorials/cuda/CMakeLists.txt +++ b/apps/tutorials/gpu/CMakeLists.txt @@ -3,5 +3,5 @@ waLBerla_link_files_to_builddir( *.png ) waLBerla_add_executable ( NAME 01_GameOfLife_cuda FILES 01_GameOfLife_cuda.cpp 01_GameOfLife_kernels.cu - DEPENDS blockforest core cuda field lbm geometry timeloop gui ) + DEPENDS blockforest core gpu 
field lbm geometry timeloop ) \ No newline at end of file diff --git a/apps/tutorials/cuda/GosperGliderGun.png b/apps/tutorials/gpu/GosperGliderGun.png similarity index 100% rename from apps/tutorials/cuda/GosperGliderGun.png rename to apps/tutorials/gpu/GosperGliderGun.png diff --git a/apps/tutorials/lbm/01_BasicLBM.cpp b/apps/tutorials/lbm/01_BasicLBM.cpp index ccfd0632d29ed34e7f43bb724500decb4f3145ff..6c1d920e9c046c0c9f0d93fd87422dc5c0406165 100644 --- a/apps/tutorials/lbm/01_BasicLBM.cpp +++ b/apps/tutorials/lbm/01_BasicLBM.cpp @@ -57,7 +57,7 @@ int main( int argc, char ** argv ) const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); const uint_t timesteps = parameters.getParameter< uint_t > ( "timesteps", uint_c( 10 ) ); - const double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds + const real_t remainingTimeLoggerFrequency = parameters.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); // in seconds // create fields LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::SRT( omega ) ); @@ -93,8 +93,9 @@ int main( int argc, char ** argv ) timeloop.add() << Sweep( makeSharedSweep( lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldId, flagFieldId, fluidFlagUID ) ), "LB stream & collide" ); // LBM stability check + auto checkFunction = [](PdfField_T::value_type value) {return math::finite( value );}; timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >( walberlaEnv.config(), blocks, pdfFieldId, - flagFieldId, fluidFlagUID ) ), + flagFieldId, fluidFlagUID, checkFunction ) ), "LBM stability check" ); // log remaining time diff --git a/apps/tutorials/lbm/01_BasicLBM.dox b/apps/tutorials/lbm/01_BasicLBM.dox index 733b6e51f4ae2d2e190b90cff68d2f257737f907..4bbeaaed4a96f5ffc3b1b47af29bcabe474462a4 100644 --- 
a/apps/tutorials/lbm/01_BasicLBM.dox +++ b/apps/tutorials/lbm/01_BasicLBM.dox @@ -261,10 +261,15 @@ The StabilityChecker instance can be controlled via the configuration file, for more information see \ref docStabilityChecker. Since field::makeStabilityChecker returns a shared pointer, we use makeSharedFunctor in order to wrap the shared pointer into an object that can be passed to the time loop. +Note that NaNs are not defined if waLBerla is build using FASTMATH. For this case the field::StabilityChecker accepts +a checkFunction as input argument. The checkFunction receives a value of the type the field::StabilityChecker is applied on +and returns a bool. This function is applied on each value on each cell. If no checkFunction is provided a default is used +which is exactly the one shown in the code below. \code +auto checkFunction = [](PdfField_T::value_type value) {return math::finite( value );}; timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >( - walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID ) ), + walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID, checkFunction ) ), "LBM stability check" ); \endcode diff --git a/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp b/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp index b135758cd57b8214c40749b1b63a8c7dd0dc578b..3476c73d0e2b9613e09de513d7b33891aa832908 100644 --- a/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp +++ b/apps/tutorials/lbm/02_BasicLBM_ExemplaryExtensions.cpp @@ -563,7 +563,7 @@ int main( int argc, char ** argv ) const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); const uint_t timesteps = parameters.getParameter< uint_t > ( "timesteps", uint_c( 10 ) ); - const double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds + const real_t 
remainingTimeLoggerFrequency = parameters.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) ); // in seconds // create lattice model diff --git a/apps/tutorials/lbm/04_LBComplexGeometry.cpp b/apps/tutorials/lbm/04_LBComplexGeometry.cpp index dedc750f84b9354e9255365ba2da229aa1c3cdcb..6148efcc4ecde851cf742863b1299bd9a7247f84 100644 --- a/apps/tutorials/lbm/04_LBComplexGeometry.cpp +++ b/apps/tutorials/lbm/04_LBComplexGeometry.cpp @@ -116,8 +116,8 @@ int main(int argc, char** argv) parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >()); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds //! [parseDomainParameters] // read domain parameters diff --git a/apps/tutorials/lbm/06_LBBoundaryCondition.cpp b/apps/tutorials/lbm/06_LBBoundaryCondition.cpp index ea41c7251223453cf4b52e8702b0ef10f33a233b..ae6f612cbb070298a16d0329d258d6256a217756 100644 --- a/apps/tutorials/lbm/06_LBBoundaryCondition.cpp +++ b/apps/tutorials/lbm/06_LBBoundaryCondition.cpp @@ -383,13 +383,13 @@ int main(int argc, char** argv) parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >()); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds // create fields - LatticeModel_T latticeModel = LatticeModel_T(lbm::collision_model::SRT(omega)); - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, 
initialVelocity, real_t(1)); - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", FieldGhostLayers); + LatticeModel_T const latticeModel = LatticeModel_T(lbm::collision_model::SRT(omega)); + BlockDataID const pdfFieldID = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, initialVelocity, real_t(1)); + BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", FieldGhostLayers); // create and initialize boundary handling @@ -409,11 +409,11 @@ int main(int argc, char** argv) setup.omega = omega; //! [timeTracker] - std::shared_ptr< lbm::TimeTracker > timeTracker = std::make_shared< lbm::TimeTracker >(); + std::shared_ptr< lbm::TimeTracker > const timeTracker = std::make_shared< lbm::TimeTracker >(); //! [timeTracker] //! [boundaryHandlingID] - BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( + BlockDataID const boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling(flagFieldID, pdfFieldID, setup, timeTracker), "boundary handling"); //! 
[boundaryHandlingID] @@ -453,7 +453,7 @@ int main(int argc, char** argv) auto vtkConfig = walberlaEnv.config()->getBlock("VTK"); - uint_t writeFrequency = vtkConfig.getBlock("fluid_field").getParameter< uint_t >("writeFrequency", uint_t(100)); + uint_t const writeFrequency = vtkConfig.getBlock("fluid_field").getParameter< uint_t >("writeFrequency", uint_t(100)); auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "fluid_field", writeFrequency, FieldGhostLayers, false, "vtk_out", "simulation_step", false, true, true, false, 0); diff --git a/apps/tutorials/pde/01_SolvingPDE.cpp b/apps/tutorials/pde/01_SolvingPDE.cpp index 766f4f7c3e4b4f13f1b47db440dded2f5eaec429..f0f996817e60baecd5b9e1b4a8a932987c07c754 100644 --- a/apps/tutorials/pde/01_SolvingPDE.cpp +++ b/apps/tutorials/pde/01_SolvingPDE.cpp @@ -252,10 +252,10 @@ int main( int argc, char ** argv ) // add fields with ghost layers to all blocks // source and destination fields for the unknowns u - BlockDataID srcID = field::addToStorage< ScalarField >( blocks, "src", real_c(0), field::zyxf, uint_c(1)); - BlockDataID dstID = field::addToStorage< ScalarField >( blocks, "dst", real_c(0), field::zyxf, uint_c(1)); + BlockDataID srcID = field::addToStorage< ScalarField >( blocks, "src", real_c(0), field::fzyx, uint_c(1)); + BlockDataID dstID = field::addToStorage< ScalarField >( blocks, "dst", real_c(0), field::fzyx, uint_c(1)); // field to store the function f - BlockDataID rhsID = field::addToStorage< ScalarField >( blocks, "rhs", real_c(0), field::zyxf, uint_c(1)); + BlockDataID rhsID = field::addToStorage< ScalarField >( blocks, "rhs", real_c(0), field::fzyx, uint_c(1)); // initialize the field initRHS( blocks, rhsID ); diff --git a/apps/tutorials/pde/02_HeatEquation.cpp b/apps/tutorials/pde/02_HeatEquation.cpp index 1ff9d3632fc7d9ff880994109d38c094256b470e..398b9a66cf05835b2b0b9a7ac92cc433ca25696b 100644 --- a/apps/tutorials/pde/02_HeatEquation.cpp +++ b/apps/tutorials/pde/02_HeatEquation.cpp @@ -213,10 +213,10 
@@ int main( int argc, char ** argv ) // add fields with ghost layers to all blocks // source and destination fields for the unknowns u, required by the Jacobi method - BlockDataID srcID = field::addToStorage< ScalarField >( blocks, "src", real_c(0), field::zyxf, uint_c(1)); - BlockDataID dstID = field::addToStorage< ScalarField >( blocks, "dst", real_c(0), field::zyxf, uint_c(1)); + BlockDataID srcID = field::addToStorage< ScalarField >( blocks, "src", real_c(0), field::fzyx, uint_c(1)); + BlockDataID dstID = field::addToStorage< ScalarField >( blocks, "dst", real_c(0), field::fzyx, uint_c(1)); // field to store the right-hand side of the equation - BlockDataID rhsID = field::addToStorage< ScalarField >( blocks, "rhs", real_c(0), field::zyxf, uint_c(1)); + BlockDataID rhsID = field::addToStorage< ScalarField >( blocks, "rhs", real_c(0), field::fzyx, uint_c(1)); // set the field to the initial condition u(x,y,0) initU( blocks, srcID ); diff --git a/apps/tutorials/pde/03_HeatEquation_Extensions.cpp b/apps/tutorials/pde/03_HeatEquation_Extensions.cpp index 037b3351bac4ffef07898ef21cb78525156067d1..e3b521447b5bcba08f18459d121acddbb02d4151 100644 --- a/apps/tutorials/pde/03_HeatEquation_Extensions.cpp +++ b/apps/tutorials/pde/03_HeatEquation_Extensions.cpp @@ -304,10 +304,10 @@ int main( int argc, char ** argv ) // add fields with ghost layers to all blocks // source and destination fields for the unknowns u, required by the Jacobi method - BlockDataID srcID = field::addToStorage< ScalarField >( blocks, "src", real_c(0), field::zyxf, uint_c(1)); - BlockDataID dstID = field::addToStorage< ScalarField >( blocks, "dst", real_c(0), field::zyxf, uint_c(1)); + BlockDataID srcID = field::addToStorage< ScalarField >( blocks, "src", real_c(0), field::fzyx, uint_c(1)); + BlockDataID dstID = field::addToStorage< ScalarField >( blocks, "dst", real_c(0), field::fzyx, uint_c(1)); // field to store the right-hand side of the equation - BlockDataID rhsID = field::addToStorage< 
ScalarField >( blocks, "rhs", real_c(0), field::zyxf, uint_c(1)); + BlockDataID rhsID = field::addToStorage< ScalarField >( blocks, "rhs", real_c(0), field::fzyx, uint_c(1)); // set the field to the initial condition u(x,y,0) initU( blocks, srcID ); diff --git a/cmake/FindParmetis.cmake b/cmake/FindParmetis.cmake new file mode 100644 index 0000000000000000000000000000000000000000..427e27673bdf352cf5ce81acd08d71639818564d --- /dev/null +++ b/cmake/FindParmetis.cmake @@ -0,0 +1,9 @@ +find_path (PARMETIS_INCLUDE_DIR parmetis.h) +find_library (PARMETIS_LIBRARY NAMES parmetis) + +# handle the QUIETLY and REQUIRED arguments and set PFFT_FOUND to TRUE if +# all listed variables are TRUE +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args (Parmetis DEFAULT_MSG PARMETIS_LIBRARY PARMETIS_INCLUDE_DIR) + +mark_as_advanced (PARMETIS_LIBRARY PARMETIS_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/waLBerlaHelperFunctions.cmake b/cmake/waLBerlaHelperFunctions.cmake index c601d2d395d4335e4f66b3550d6e5174d74cffc4..efd2d0576eaa072dbfc8b3c278bb0ad15002eb70 100644 --- a/cmake/waLBerlaHelperFunctions.cmake +++ b/cmake/waLBerlaHelperFunctions.cmake @@ -67,6 +67,7 @@ function( waLBerla_generate_target_from_python ) "\"CODEGEN_CFG\": \"${codegenCfg}\"," "\"WALBERLA_BUILD_WITH_MPI\": \"${WALBERLA_BUILD_WITH_MPI}\"," "\"WALBERLA_BUILD_WITH_CUDA\": \"${WALBERLA_BUILD_WITH_CUDA}\"," + "\"WALBERLA_BUILD_WITH_HIP\": \"${WALBERLA_BUILD_WITH_HIP}\"," "\"WALBERLA_BUILD_WITH_OPENMP\": \"${WALBERLA_BUILD_WITH_OPENMP}\" \\\}" ) string(REPLACE "\"" "\\\"" cmakeVars ${cmakeVars}) # even one more quoting level required diff --git a/doc/Mainpage.dox b/doc/Mainpage.dox index 8ba191634cf8a090b6d053db0a449cf50b41189f..f6a7ed06a90bd02b87e66b69137478c0a67f4430 100644 --- a/doc/Mainpage.dox +++ b/doc/Mainpage.dox @@ -47,6 +47,14 @@ all the basic data strcutures and concepts of the framework. 
- \ref tutorial_lbm06 \n This tutorial deals with the usage of different LBM boundary conditions. +\subsection advanced_topics Advanced Topics + +\subsection gpu GPU + +- \ref tutorial_gpu01 \n + A simple tutorial for Game of Life on GPU + + \subsection codegen Code Generation - \ref tutorial_codegen01 \n diff --git a/doc/pics/GameOfLifeGPU.png b/doc/pics/GameOfLifeGPU.png new file mode 100644 index 0000000000000000000000000000000000000000..769b250806abbab16fd74691e8d2a0e3154826e4 Binary files /dev/null and b/doc/pics/GameOfLifeGPU.png differ diff --git a/doc/setup.dox b/doc/setup.dox index ab9ba0588aa13b53cb631ed84d226f2c3a03d520..4afde5ea5edee04d21c6429ee606959eae0f886b 100644 --- a/doc/setup.dox +++ b/doc/setup.dox @@ -68,6 +68,8 @@ WALBERLA_BUILD_WITH_OPENMP | OFF | Enables/Disables OpenMP support WALBERLA_BUILD_TESTS | OFF | If enabled, all tests are built when running make in the root build folder. But you can always go to a specific directory in your test folder and manually run make. WALBERLA_BUILD_BENCHMARKS | ON | Enables/Disables the automatic build of all benchmarks located in "apps/benchmarks". WALBERLA_BUILD_WITH_PYTHON | OFF | Enables Python Support inside waLBerla (embedded Python). Then you can use Python scripts as configuration files and start an embedded python interpreter that can access waLBerla data structures. This builds a shared library (and python module) walberla_cpp.so in "apps/pythonmodule" so that you can use walberla from python. +WALBERLA_BUILD_WITH_CUDA | OFF | Enables/Disables support to run waLBerla on NVIDIA GPUs. +WALBERLA_BUILD_WITH_HIP | OFF | Enables/Disables support to run waLBerla on AMD GPUs. For a list of all switches, see CMakeLists.txt in the root source folder. 
diff --git a/python/lbmpy_walberla/__init__.py b/python/lbmpy_walberla/__init__.py index 15de37c8112e16a21476ba3e388adc843af92956..deb96e02ed3e5e5acfff016b7f185676788b7a76 100644 --- a/python/lbmpy_walberla/__init__.py +++ b/python/lbmpy_walberla/__init__.py @@ -1,8 +1,16 @@ from .boundary import generate_boundary, generate_alternating_lbm_boundary +from .boundary_collection import lbm_boundary_generator, generate_boundary_collection from .walberla_lbm_generation import RefinementScaling, generate_lattice_model +from .storage_specification import generate_lbm_storage_specification +from .sweep_collection import generate_lbm_sweep_collection from .packinfo import generate_lb_pack_info +from .packing_kernels import generate_packing_kernels from .alternating_sweeps import generate_alternating_lbm_sweep +from .walberla_lbm_package import generate_lbm_package __all__ = ['generate_lattice_model', 'generate_alternating_lbm_sweep', - 'RefinementScaling', 'generate_boundary', 'generate_alternating_lbm_boundary', - 'generate_lb_pack_info'] + 'generate_lbm_storage_specification', 'generate_lbm_sweep_collection', + 'RefinementScaling', 'lbm_boundary_generator', 'generate_boundary_collection', 'generate_boundary', + 'generate_alternating_lbm_boundary', + 'generate_lb_pack_info', 'generate_packing_kernels', + 'generate_lbm_package'] diff --git a/python/lbmpy_walberla/alternating_sweeps.py b/python/lbmpy_walberla/alternating_sweeps.py index dbcc1ab54e618101658a2c2262dac946f9d99805..444a2000adb65c3ad66bfc028f7bedcab4e60896 100644 --- a/python/lbmpy_walberla/alternating_sweeps.py +++ b/python/lbmpy_walberla/alternating_sweeps.py @@ -1,14 +1,17 @@ from dataclasses import replace +from typing import Set import numpy as np -from pystencils_walberla.codegen import generate_selective_sweep, config_from_context -from pystencils_walberla.kernel_selection import ( - AbstractInterfaceArgumentMapping, AbstractConditionNode, KernelCallNode) from pystencils import Target, TypedSymbol from 
lbmpy.creationfunctions import create_lb_ast from lbmpy.advanced_streaming import Timestep, is_inplace +from pystencils_walberla.sweep import generate_selective_sweep +from pystencils_walberla.kernel_selection import ( + AbstractInterfaceArgumentMapping, AbstractConditionNode, KernelCallNode) +from pystencils_walberla.utility import config_from_context + class EvenIntegerCondition(AbstractConditionNode): def __init__(self, parameter_name: str, @@ -54,7 +57,7 @@ class TimestepTrackerMapping(AbstractInterfaceArgumentMapping): return f"{self.tracker_symbol.name}->getCounter()" @property - def headers(self): + def headers(self) -> Set: return {'"lbm/inplace_streaming/TimestepTracker.h"'} diff --git a/python/lbmpy_walberla/boundary_collection.py b/python/lbmpy_walberla/boundary_collection.py new file mode 100644 index 0000000000000000000000000000000000000000..17bfa245a3212404c35dd06c420fcb19a55c3049 --- /dev/null +++ b/python/lbmpy_walberla/boundary_collection.py @@ -0,0 +1,147 @@ +from jinja2 import Environment, PackageLoader, StrictUndefined + +import pystencils_walberla.boundary +from lbmpy.boundaries.boundaryconditions import LbBoundary +from lbmpy.boundaries.boundaryhandling import create_lattice_boltzmann_boundary_kernel +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from lbmpy.advanced_streaming import Timestep, is_inplace + +from pystencils_walberla.kernel_selection import KernelCallNode +from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition, OddIntegerCondition, TimestepTrackerMapping +from lbmpy_walberla.additional_data_handler import default_additional_data_handler + +from pystencils import Target + +import numpy as np + + +def lbm_boundary_generator(class_name: str, flag_uid: str, boundary_object: LbBoundary, additional_data_handler=None): + def generation_function(ctx, lb_method, field_name='pdfs', + streaming_pattern='pull', after_collision=True, + namespace='lbm', + **create_kernel_params): + context = 
__generate_alternating_lbm_boundary(generation_context=ctx, + class_name=class_name, + boundary_object=boundary_object, + lb_method=lb_method, + field_name=field_name, + streaming_pattern=streaming_pattern, + after_collision=after_collision, + additional_data_handler=additional_data_handler, + namespace=namespace, + **create_kernel_params) + + return context + + return {'flag_id': flag_uid, 'generator': generation_function} + + +def generate_boundary_collection(generation_context, + class_name, + boundary_generators, + lb_method, + field_name='pdfs', + streaming_pattern='pull', + prev_timestep=Timestep.BOTH, + namespace='lbm', + **create_kernel_params): + + kernel_list = [] + includes = [] + boundary_classes = [] + flag_uids = [] + object_names = [] + targets = [] + + for boundary_generator in boundary_generators: + boundary_functor = boundary_generator['generator'] + context = boundary_functor(generation_context, lb_method, field_name, streaming_pattern, prev_timestep, + namespace, **create_kernel_params) + + kernel_list.append(context['kernel']) + includes.append(f"\"{context['class_name']}.h\"") + boundary_classes.append(f"{context['namespace']}::{context['class_name']}") + flag_uids.append(boundary_generator['flag_id']) + object_names.append(f"{context['class_name']}Object") + targets.append(f"{context['target']}") + + assert len(set(targets)) == 1 + target = targets[0] + + jinja_context = { + 'kernel_list': kernel_list, + 'class_name': class_name, + 'target': target, + 'namespace': namespace, + 'includes': includes, + 'boundary_classes': boundary_classes, + 'flag_uids': flag_uids, + 'object_names': object_names + } + + env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined) + env.globals.update(zip=zip) + add_pystencils_filters_to_jinja_env(env) + + header = env.get_template("BoundaryCollection.tmpl.h").render(**jinja_context) + + generation_context.write_file(f"{class_name}.h", header) + + +# Internal +def 
__generate_alternating_lbm_boundary(generation_context, + class_name, + boundary_object, + lb_method, + field_name='pdfs', + streaming_pattern='pull', + after_collision=True, + additional_data_handler=None, + namespace='lbm', + **create_kernel_params): + if boundary_object.additional_data and additional_data_handler is None: + target = create_kernel_params.get('target', Target.CPU) + additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, target=target) + + timestep_param_name = 'timestep' + timestep_param_dtype = np.uint8 + + def boundary_creation_function(field, index_field, stencil, boundary_functor, target=Target.CPU, **kwargs): + pargs = (field, index_field, lb_method, boundary_functor) + kwargs = {'target': target, **kwargs} + ast_even = create_lattice_boltzmann_boundary_kernel(*pargs, + streaming_pattern=streaming_pattern, + prev_timestep=Timestep.EVEN, + **kwargs) + ast_even.function_name = 'even' + kernel_even = KernelCallNode(ast_even) + + if is_inplace(streaming_pattern): + ast_odd = create_lattice_boltzmann_boundary_kernel(*pargs, + streaming_pattern=streaming_pattern, + prev_timestep=Timestep.ODD, + **kwargs) + ast_odd.function_name = 'odd' + kernel_odd = KernelCallNode(ast_odd) + else: + kernel_odd = kernel_even + + if after_collision: + return EvenIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype) + else: + return OddIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype) + + timestep_advancement = {"field_name": field_name, "function": "getTimestep"} + + context = pystencils_walberla.boundary.generate_boundary(generation_context, + class_name, + boundary_object, + field_name=field_name, + neighbor_stencil=lb_method.stencil, + index_shape=[lb_method.stencil.Q], + kernel_creation_function=boundary_creation_function, + namespace=namespace, + additional_data_handler=additional_data_handler, + field_timestep=timestep_advancement, + 
**create_kernel_params) + return context diff --git a/python/lbmpy_walberla/function_generator.py b/python/lbmpy_walberla/function_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3d552c2daa42bb12223150b65a7de0a59f8b3a --- /dev/null +++ b/python/lbmpy_walberla/function_generator.py @@ -0,0 +1,26 @@ +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec + + +def kernel_family_function_generator(class_name: str, kernel_family: KernelFamily, + namespace: str = 'lbm', max_threads: int = None): + + return lambda: __function_generator(class_name, kernel_family, namespace, max_threads) + + +def __function_generator(class_name: str, kernel_family: KernelFamily, + namespace: str = 'lbm', max_threads: int = None): + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] + + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, ()) + + jinja_context = { + 'kernel': kernel_family, + 'namespace': namespace, + 'function_name': class_name, + 'field': representative_field, + 'interface_spec': interface_spec, + 'max_threads': max_threads + } + return jinja_context diff --git a/python/lbmpy_walberla/packinfo.py b/python/lbmpy_walberla/packinfo.py index 796ccfd9832b082610d5dc8b1065ddfa8450ca36..b53ef743f03c0f4c75128f0cce3d3fbaffbff593 100644 --- a/python/lbmpy_walberla/packinfo.py +++ b/python/lbmpy_walberla/packinfo.py @@ -6,7 +6,7 @@ from lbmpy.advanced_streaming.communication import _extend_dir from pystencils import Assignment, Field, Target from pystencils.stencil import inverse_direction -from pystencils_walberla.codegen import comm_directions, generate_pack_info +from pystencils_walberla.pack_info import _comm_directions, generate_pack_info def generate_lb_pack_info(generation_context, @@ -65,7 +65,7 @@ def generate_lb_pack_info(generation_context, if all(offset == 0 
for offset in fa.offsets): continue comm_direction = inverse_direction(fa.offsets) - for comm_dir in comm_directions(comm_direction): + for comm_dir in _comm_directions(comm_direction): common_spec[(comm_dir,)].add(fa.field.center(*fa.index)) full_stencil = LBStencil(Stencil.D3Q27) if stencil.D == 3 else LBStencil(Stencil.D2Q9) diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..985193f1434dd43d4294067a46ea7ba2ac01dbb3 --- /dev/null +++ b/python/lbmpy_walberla/packing_kernels.py @@ -0,0 +1,462 @@ +from dataclasses import replace +from itertools import product + +import numpy as np +import sympy as sp + +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils import Assignment, CreateKernelConfig, create_kernel, Field, FieldType, fields, Target +from pystencils.stencil import offset_to_direction_string +from pystencils.typing import TypedSymbol +from pystencils.stencil import inverse_direction +from pystencils.bit_masks import flag_cond + +from lbmpy.advanced_streaming import get_accessor, is_inplace, get_timesteps, Timestep +from lbmpy.advanced_streaming.communication import _extend_dir +from lbmpy.enums import Stencil +from lbmpy.stencils import LBStencil + +from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.utility import config_from_context + +from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition +from lbmpy_walberla.utility import timestep_suffix + + +def generate_packing_kernels(generation_context, class_name: str, stencil: LBStencil, streaming_pattern: str = 'pull', + namespace='lbm', nonuniform: bool = False, + target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False, + **create_kernel_params): + + config = config_from_context(generation_context, 
target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + # Packing kernels should never be vectorised + config = replace(config, cpu_vectorize_info=None) + + default_dtype = config.data_type.default_factory() + is_float = True if issubclass(default_dtype.numpy_dtype.type, np.float32) else False + + cg = PackingKernelsCodegen(stencil, streaming_pattern, class_name, config) + + kernels = cg.create_uniform_kernel_families() + + if nonuniform: + kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels) + + jinja_context = { + 'class_name': class_name, + 'namespace': namespace, + 'nonuniform': nonuniform, + 'target': target.name.lower(), + 'dtype': "float" if is_float else "double", + 'is_gpu': target == Target.GPU, + 'kernels': kernels, + 'inplace': is_inplace(streaming_pattern), + 'direction_sizes': cg.get_direction_sizes(), + 'stencil_size': stencil.Q, + 'dimension': stencil.D, + 'src_field': cg.src_field, + 'dst_field': cg.dst_field + } + + if nonuniform: + jinja_context['mask_field'] = cg.mask_field + + template_name = "NonuniformPackingKernels" if nonuniform else "PackingKernels" + + env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + header = env.get_template(f"{template_name}.tmpl.h").render(**jinja_context) + source = env.get_template(f"{template_name}.tmpl.cpp").render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) + + +# ------------------------------ INTERNAL ---------------------------------------------------------------------------- + +class PackingKernelsCodegen: + + def __init__(self, stencil, streaming_pattern, class_name, config: CreateKernelConfig): + self.stencil = stencil + self.dim = stencil.D + self.values_per_cell = stencil.Q + self.full_stencil = 
LBStencil(Stencil.D3Q27) if self.dim == 3 else LBStencil(Stencil.D2Q9) + self.streaming_pattern = streaming_pattern + self.inplace = is_inplace(streaming_pattern) + self.class_name = class_name + self.config = config + self.data_type = config.data_type['pdfs'].numpy_dtype + + self.src_field, self.dst_field = fields( + f'pdfs_src({self.values_per_cell}), pdfs_dst({self.values_per_cell}) :{self.data_type}[{self.dim}D]') + self.accessors = [get_accessor(streaming_pattern, t) for t in get_timesteps(streaming_pattern)] + self.mask_field = fields(f'mask : uint32 [{self.dim}D]') + + def create_uniform_kernel_families(self, kernels_dict=None): + kernels = dict() if kernels_dict is None else kernels_dict + + kernels['packAll'] = self.get_pack_all_kernel_family() + kernels['unpackAll'] = self.get_unpack_all_kernel_family() + kernels['localCopyAll'] = self.get_local_copy_all_kernel_family() + + kernels['packDirection'] = self.get_pack_direction_kernel_family() + kernels['unpackDirection'] = self.get_unpack_direction_kernel_family() + kernels['localCopyDirection'] = self.get_local_copy_direction_kernel_family() + return kernels + + def create_nonuniform_kernel_families(self, kernels_dict=None): + kernels = dict() if kernels_dict is None else kernels_dict + kernels['unpackRedistribute'] = self.get_unpack_redistribute_kernel_family() + kernels['packPartialCoalescence'] = self.get_pack_partial_coalescence_kernel_family() + kernels['zeroCoalescenceRegion'] = self.get_zero_coalescence_region_kernel_family() + kernels['unpackCoalescence'] = self.get_unpack_coalescence_kernel_family() + + return kernels + + # --------------------------- Pack / Unpack / LocalCopy All -------------------------------------------------------- + + def get_pack_all_ast(self, timestep): + config = replace(self.config, ghost_layers=0) + + buffer = self._buffer(self.values_per_cell) + src, _ = self._stream_out_accs(timestep) + assignments = [Assignment(buffer(i), src[i]) for i in range(self.values_per_cell)] 
+ ast = create_kernel(assignments, config=config) + ast.function_name = 'pack_ALL' + timestep_suffix(timestep) + return ast + + def get_pack_all_kernel_family(self): + if not self.inplace: + tree = KernelCallNode(self.get_pack_all_ast(Timestep.BOTH)) + else: + even_call = KernelCallNode(self.get_pack_all_ast(Timestep.EVEN)) + odd_call = KernelCallNode(self.get_pack_all_ast(Timestep.ODD)) + tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + def get_unpack_all_ast(self, timestep): + config = replace(self.config, ghost_layers=0) + + buffer = self._buffer(self.values_per_cell) + _, dst = self._stream_out_accs(timestep) + assignments = [Assignment(dst[i], buffer(i)) for i in range(self.values_per_cell)] + ast = create_kernel(assignments, config=config) + ast.function_name = 'unpack_ALL' + timestep_suffix(timestep) + return ast + + def get_unpack_all_kernel_family(self): + if not self.inplace: + tree = KernelCallNode(self.get_unpack_all_ast(Timestep.BOTH)) + else: + even_call = KernelCallNode(self.get_unpack_all_ast(Timestep.EVEN)) + odd_call = KernelCallNode(self.get_unpack_all_ast(Timestep.ODD)) + tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + def get_local_copy_all_ast(self, timestep): + config = replace(self.config, ghost_layers=0) + + src, dst = self._stream_out_accs(timestep) + assignments = [Assignment(dst[i], src[i]) for i in range(self.values_per_cell)] + ast = create_kernel(assignments, config=config) + ast.function_name = 'localCopy_ALL' + timestep_suffix(timestep) + return ast + + def get_local_copy_all_kernel_family(self): + if not self.inplace: + tree = KernelCallNode(self.get_local_copy_all_ast(Timestep.BOTH)) + else: + even_call = KernelCallNode(self.get_local_copy_all_ast(Timestep.EVEN)) + odd_call = KernelCallNode(self.get_local_copy_all_ast(Timestep.ODD)) + tree = 
EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + # --------------------------- Pack / Unpack / LocalCopy Direction -------------------------------------------------- + + def get_pack_direction_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(comm_dir) + buffer = self._buffer(len(streaming_dirs)) + src, _ = self._stream_out_accs(timestep) + assignments = [] + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + for i, d in enumerate(dir_indices): + assignments.append(Assignment(buffer(i), src[d])) + ast = create_kernel(assignments, config=config) + ast.function_name = f'pack_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_pack_direction_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_pack_direction_ast) + + def get_unpack_direction_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + buffer = self._buffer(len(streaming_dirs)) + _, dst = self._stream_out_accs(timestep) + assignments = [] + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + for i, d in enumerate(dir_indices): + assignments.append(Assignment(dst[d], buffer(i))) + ast = create_kernel(assignments, config=config) + ast.function_name = f'unpack_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_unpack_direction_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_unpack_direction_ast) + + def get_local_copy_direction_ast(self, comm_dir, timestep): + 
config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(comm_dir) + src, dst = self._stream_out_accs(timestep) + assignments = [] + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + for direction in dir_indices: + assignments.append(Assignment(dst[direction], src[direction])) + ast = create_kernel(assignments, config=config) + ast.function_name = f'localCopy_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_local_copy_direction_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_local_copy_direction_ast) + + # --------------------------- Pack / Unpack / LocalCopy Coarse to Fine --------------------------------------------- + + def get_unpack_redistribute_ast(self, comm_dir, timestep): + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + buffer = self._buffer(self.values_per_cell) + _, dst = self._stream_out_accs(timestep) + orthos = self.orthogonal_principals(comm_dir) + sub_dirs = self.contained_principals(comm_dir) + orthogonal_combinations = self.linear_combinations(orthos) + subdir_combinations = self.linear_combinations_nozero(sub_dirs) + second_gl_dirs = [o + s for o, s in product(orthogonal_combinations, subdir_combinations)] + negative_dir_correction = np.array([(1 if d == -1 else 0) for d in comm_dir]) + assignments = [] + for offset in orthogonal_combinations: + o = offset + negative_dir_correction + for d in range(self.values_per_cell): + field_acc = dst[d].get_shifted(*o) + assignments.append(Assignment(field_acc, buffer(d))) + + for offset in second_gl_dirs: + o = offset + 
negative_dir_correction + for d in dir_indices: + field_acc = dst[d].get_shifted(*o) + assignments.append(Assignment(field_acc, buffer(d))) + + function_name = f'unpackRedistribute_{dir_string}' + timestep_suffix(timestep) + iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim)) + config = CreateKernelConfig(function_name=function_name, iteration_slice=iteration_slice, + data_type=self.data_type, ghost_layers=0, allow_double_writes=True, + cpu_openmp=self.config.cpu_openmp, target=self.config.target) + + return create_kernel(assignments, config=config) + + def get_unpack_redistribute_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast) + + def get_local_copy_redistribute_ast(self, comm_dir, timestep): + # TODO + raise NotImplementedError() + + def get_local_copy_redistribute_kernel_family(self): + # TODO + raise NotImplementedError() + + # --------------------------- Pack / Unpack / LocalCopy Fine to Coarse --------------------------------------------- + + def get_pack_partial_coalescence_ast(self, comm_dir, timestep): + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(comm_dir) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + buffer = self._buffer(self.values_per_cell) + src, _ = self._stream_in_accs(timestep.next()) + mask = self.mask_field + + offsets = list(product(*((0, 1) for _ in comm_dir))) + assignments = [] + for i, d in enumerate(dir_indices): + acc = 0 + for o in offsets: + acc += flag_cond(d, mask[o], src[d].get_shifted(*o)) + assignments.append(Assignment(buffer(i), acc)) + + iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim)) + config = replace(self.config, iteration_slice=iteration_slice, ghost_layers=0) + + ast = create_kernel(assignments, config=config) + ast.function_name = 
f'packPartialCoalescence_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_pack_partial_coalescence_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast) + + def get_unpack_coalescence_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + assert not all(d == 0 for d in comm_dir) + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + buffer = self._buffer(self.values_per_cell) + _, dst = self._stream_in_accs(timestep.next()) + + coalescence_factor = sp.Rational(1, 2 ** self.dim) + + assignments = [] + for i, d in enumerate(dir_indices): + assignments.append(Assignment(dst[d], dst[d] + coalescence_factor * buffer(i))) + + ast = create_kernel(assignments, config=config) + ast.function_name = f'unpackCoalescence_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_unpack_coalescence_kernel_family(self): + return self._construct_directionwise_kernel_family(self.get_unpack_coalescence_ast) + + def get_zero_coalescence_region_ast(self, comm_dir, timestep): + config = replace(self.config, ghost_layers=0) + + dir_string = offset_to_direction_string(comm_dir) + streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir)) + dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs) + if len(dir_indices) == 0: + return None + _, dst = self._stream_in_accs(timestep.next()) + + assignments = [] + for i, d in enumerate(dir_indices): + assignments.append(Assignment(dst[d], 0.0)) + + ast = create_kernel(assignments, config=config) + ast.function_name = f'zeroCoalescenceRegion_{dir_string}' + timestep_suffix(timestep) + return ast + + def get_zero_coalescence_region_kernel_family(self): + return 
self._construct_directionwise_kernel_family(self.get_zero_coalescence_region_ast) + + # TODO + def get_local_copy_partial_coalescence_ast(self, comm_dir, timestep): + raise NotImplementedError() + + def get_local_copy_partial_coalescence_kernel_family(self): + raise NotImplementedError() + + # ------------------------------------------ Utility --------------------------------------------------------------- + + def get_streaming_dirs(self, comm_dir): + if all(d == 0 for d in comm_dir): + return set() + else: + return set(_extend_dir(comm_dir)) & set(self.stencil) + + def get_direction_sizes(self): + return [len(self.get_streaming_dirs(d)) for d in self.full_stencil] + + def principal(self, i): + e_i = np.zeros(self.dim, dtype=int) + e_i[i] = 1 + return e_i + + def principals(self): + """Returns the principal directions for the given dimension""" + return tuple(self.principal(i) for i in range(self.dim)) + + def orthogonal_principals(self, comm_dir): + """Returns the positive principal directions orthogonal to the comm_dir""" + return tuple(p for i, p in enumerate(self.principals()) if comm_dir[i] == 0) + + def contained_principals(self, comm_dir): + """Returns the (positive or negative) principal directions contained in comm_dir""" + vecs = [] + for i, d in enumerate(comm_dir): + if d != 0: + vecs.append(d * self.principal(i)) + return vecs + + def linear_combinations(self, vectors): + if not vectors: + return [np.zeros(self.dim, dtype=int)] + else: + rest = self.linear_combinations(vectors[1:]) + return rest + [vectors[0] + r for r in rest] + + def linear_combinations_nozero(self, vectors): + if len(vectors) == 1: + return [vectors[0]] + else: + rest = self.linear_combinations_nozero(vectors[1:]) + return rest + [vectors[0]] + [vectors[0] + r for r in rest] + + # --------------------------- Private Members ---------------------------------------------------------------------- + + def _construct_directionwise_kernel_family(self, create_ast_callback): + subtrees = [] 
+ direction_symbol = TypedSymbol('dir', dtype='stencil::Direction') + for t in get_timesteps(self.streaming_pattern): + cases_dict = dict() + for comm_dir in self.full_stencil: + if all(d == 0 for d in comm_dir): + continue + dir_string = offset_to_direction_string(comm_dir) + ast = create_ast_callback(comm_dir, t) + if ast is None: + continue + kernel_call = KernelCallNode(ast) + cases_dict[f"stencil::{dir_string}"] = kernel_call + subtrees.append(SwitchNode(direction_symbol, cases_dict)) + + if not self.inplace: + tree = subtrees[0] + else: + tree = EvenIntegerCondition('timestep', subtrees[Timestep.EVEN.idx], subtrees[Timestep.ODD.idx], + parameter_dtype=np.uint8) + return KernelFamily(tree, self.class_name) + + def _stream_out_accs(self, timestep): + accessor = self.accessors[timestep.idx] + src_stream_out_accs = accessor.write(self.src_field, self.stencil) + dst_stream_out_accs = accessor.write(self.dst_field, self.stencil) + return src_stream_out_accs, dst_stream_out_accs + + def _stream_in_accs(self, timestep): + accessor = self.accessors[timestep.idx] + src_stream_in_accs = accessor.read(self.src_field, self.stencil) + dst_stream_in_accs = accessor.read(self.dst_field, self.stencil) + return src_stream_in_accs, dst_stream_in_accs + + def _buffer(self, size): + return Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER, + dtype=self.data_type, + index_shape=(size,)) diff --git a/python/lbmpy_walberla/storage_specification.py b/python/lbmpy_walberla/storage_specification.py new file mode 100644 index 0000000000000000000000000000000000000000..de82603a022bb45c74db8cbadcb35eee724775ff --- /dev/null +++ b/python/lbmpy_walberla/storage_specification.py @@ -0,0 +1,88 @@ +# import warnings + +from dataclasses import replace +from jinja2 import Environment, PackageLoader, StrictUndefined +import numpy as np + +from pystencils import Target + +from lbmpy import LBMConfig +from lbmpy.advanced_streaming import is_inplace +from lbmpy.methods 
import AbstractLbMethod + +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.utility import config_from_context +from lbmpy_walberla.packing_kernels import PackingKernelsCodegen + + +def generate_lbm_storage_specification(generation_context, class_name: str, + method: AbstractLbMethod, lbm_config: LBMConfig, nonuniform: bool = False, + target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False, + **create_kernel_params): + namespace = "lbm" + stencil = method.stencil + stencil_name = stencil.name + streaming_pattern = lbm_config.streaming_pattern + + config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + # Packing kernels should never be vectorised + config = replace(config, cpu_vectorize_info=None) + + default_dtype = config.data_type.default_factory() + is_float = True if issubclass(default_dtype.numpy_dtype.type, np.float32) else False + constant_suffix = "f" if is_float else "" + + cg = PackingKernelsCodegen(stencil, streaming_pattern, class_name, config) + kernels = cg.create_uniform_kernel_families() + + if nonuniform: + kernels = cg.create_nonuniform_kernel_families(kernels_dict=kernels) + + # Pure storage specification + if not stencil_name: + raise ValueError("lb_method uses a stencil that is not supported in waLBerla") + + communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27" + + cqc = method.conserved_quantity_computation + equilibrium = method.equilibrium_distribution + + jinja_context = { + 'class_name': class_name, + 'namespace': namespace, + 'stencil_name': stencil_name, + 'communication_stencil_name': communication_stencil_name, + 'stencil_size': stencil.Q, + 'dimension': stencil.D, + 'compressible': cqc.compressible, + 'equilibrium_accuracy_order': equilibrium.order, + 'equilibrium_deviation_only': equilibrium.deviation_only, + 'inplace': is_inplace(streaming_pattern), + 
'zero_centered': cqc.zero_centered_pdfs, + 'weights': ",".join(str(w.evalf()) + constant_suffix for w in method.weights), + 'inverse_weights': ",".join(str((1 / w).evalf()) + constant_suffix for w in method.weights), + + 'nonuniform': nonuniform, + 'target': target.name.lower(), + 'dtype': "float" if is_float else "double", + 'is_gpu': target == Target.GPU, + 'kernels': kernels, + 'direction_sizes': cg.get_direction_sizes(), + 'src_field': cg.src_field, + 'dst_field': cg.dst_field + + } + if nonuniform: + jinja_context['mask_field'] = cg.mask_field + + env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + + header = env.get_template('LbmStorageSpecification.tmpl.h').render(**jinja_context) + source = env.get_template('LbmStorageSpecification.tmpl.cpp').render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/lbmpy_walberla/sweep_collection.py b/python/lbmpy_walberla/sweep_collection.py new file mode 100644 index 0000000000000000000000000000000000000000..8edd0779b328de768cba4a3acb5f04bdb6bb3acf --- /dev/null +++ b/python/lbmpy_walberla/sweep_collection.py @@ -0,0 +1,230 @@ +from dataclasses import replace +from typing import Dict + +import sympy as sp +import numpy as np + +from pystencils import Target, create_kernel +from pystencils.config import CreateKernelConfig +from pystencils.field import Field + +from lbmpy.advanced_streaming import is_inplace, get_accessor, Timestep +from lbmpy.creationfunctions import LbmCollisionRule +from lbmpy.fieldaccess import CollideOnlyInplaceAccessor +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter, macroscopic_values_getter +from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel + +from pystencils_walberla.kernel_selection import 
KernelCallNode, KernelFamily +from pystencils_walberla.utility import config_from_context +from pystencils_walberla import generate_sweep_collection + +from .alternating_sweeps import EvenIntegerCondition +from .function_generator import kernel_family_function_generator + + +def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmCollisionRule, + streaming_pattern='pull', + field_layout='fzyx', refinement_scaling=None, + macroscopic_fields: Dict[str, Field] = None, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **create_kernel_params): + config = config_from_context(ctx, target=target, data_type=data_type, + cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) + + # usually a numpy layout is chosen by default i.e. xyzf - which is bad for waLBerla where at least the spatial + # coordinates should be ordered in reverse direction i.e. zyx + lb_method = collision_rule.method + + q = lb_method.stencil.Q + dim = lb_method.stencil.D + + if field_layout == 'fzyx': + config.cpu_vectorize_info['assume_inner_stride_one'] = True + elif field_layout == 'zyxf': + config.cpu_vectorize_info['assume_inner_stride_one'] = False + + src_field = Field.create_generic('pdfs', dim, config.data_type['pdfs'].numpy_dtype, + index_dimensions=1, layout=field_layout, index_shape=(q,)) + if is_inplace(streaming_pattern): + dst_field = src_field + else: + dst_field = Field.create_generic('pdfs_tmp', dim, config.data_type['pdfs_tmp'].numpy_dtype, + index_dimensions=1, layout=field_layout, + index_shape=(q,)) + + config = replace(config, ghost_layers=0) + + function_generators = [] + + def family(name): + return lbm_kernel_family(class_name, name, collision_rule, streaming_pattern, src_field, dst_field, config) + + def generator(name, kernel_family): + return kernel_family_function_generator(name, kernel_family, namespace='lbm', max_threads=max_threads) + + 
function_generators.append(generator('streamCollide', family("streamCollide"))) + function_generators.append(generator('collide', family("collide"))) + function_generators.append(generator('stream', family("stream"))) + function_generators.append(generator('streamOnlyNoAdvancement', family("streamOnlyNoAdvancement"))) + + setter_family = get_setter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields, config) + setter_generator = kernel_family_function_generator('initialise', setter_family, + namespace='lbm', max_threads=max_threads) + function_generators.append(setter_generator) + + getter_family = get_getter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields, config) + getter_generator = kernel_family_function_generator('calculateMacroscopicParameters', getter_family, + namespace='lbm', max_threads=max_threads) + function_generators.append(getter_generator) + + generate_sweep_collection(ctx, class_name, function_generators, refinement_scaling) + + +class RefinementScaling: + def __init__(self): + self.scaling_info = [] + + def add_standard_relaxation_rate_scaling(self, viscosity_relaxation_rate): + self.add_scaling(viscosity_relaxation_rate) + + def add_scaling(self, parameter): + if isinstance(parameter, sp.Symbol): + self.scaling_info.append(parameter.name) + else: + raise ValueError("Only pure symbols allowed") + + +def lbm_kernel_family(class_name, kernel_name, + collision_rule, streaming_pattern, src_field, dst_field, config: CreateKernelConfig): + + if kernel_name == "streamCollide": + def lbm_kernel(field_accessor, lb_stencil): + return create_lbm_kernel(collision_rule, src_field, dst_field, field_accessor) + advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"} + temporary_fields = ['pdfs_tmp'] + field_swaps = [('pdfs', 'pdfs_tmp')] + elif kernel_name == "collide": + def lbm_kernel(field_accessor, lb_stencil): + return create_lbm_kernel(collision_rule, src_field, dst_field, 
CollideOnlyInplaceAccessor()) + advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"} + temporary_fields = () + field_swaps = () + elif kernel_name == "stream": + def lbm_kernel(field_accessor, lb_stencil): + return create_stream_only_kernel(lb_stencil, src_field, dst_field, field_accessor) + advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"} + temporary_fields = ['pdfs_tmp'] + field_swaps = [('pdfs', 'pdfs_tmp')] + elif kernel_name == "streamOnlyNoAdvancement": + def lbm_kernel(field_accessor, lb_stencil): + return create_stream_only_kernel(lb_stencil, src_field, dst_field, field_accessor) + advance_timestep = {"field_name": src_field.name, "function": "getTimestepPlusOne"} + temporary_fields = ['pdfs_tmp'] + field_swaps = () + else: + raise ValueError(f"kernel name: {kernel_name} is not valid") + + lb_method = collision_rule.method + stencil = lb_method.stencil + + if is_inplace(streaming_pattern): + nodes = list() + for timestep in [Timestep.EVEN, Timestep.ODD]: + accessor = get_accessor(streaming_pattern, timestep) + timestep_suffix = str(timestep) + + update_rule = lbm_kernel(accessor, stencil) + ast = create_kernel(update_rule, config=config) + ast.function_name = 'kernel_' + kernel_name + timestep_suffix + ast.assumed_inner_stride_one = config.cpu_vectorize_info['assume_inner_stride_one'] + nodes.append(KernelCallNode(ast)) + + tree = EvenIntegerCondition('timestep', nodes[0], nodes[1], parameter_dtype=np.uint8) + family = KernelFamily(tree, class_name, field_timestep=advance_timestep) + else: + timestep = Timestep.BOTH + accessor = get_accessor(streaming_pattern, timestep) + + update_rule = lbm_kernel(accessor, stencil) + ast = create_kernel(update_rule, config=config) + ast.function_name = 'kernel_' + kernel_name + ast.assumed_inner_stride_one = config.cpu_vectorize_info['assume_inner_stride_one'] + node = KernelCallNode(ast) + family = KernelFamily(node, class_name, 
temporary_fields=temporary_fields, field_swaps=field_swaps) + + return family + + +def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: CreateKernelConfig): + dim = lb_method.stencil.D + density = macroscopic_fields.get('density', 1.0) + velocity = macroscopic_fields.get('velocity', [0.0] * dim) + + get_timestep = {"field_name": pdfs.name, "function": "getTimestep"} + temporary_fields = () + field_swaps = () + + if is_inplace(streaming_pattern): + nodes = list() + for timestep in [Timestep.EVEN, Timestep.ODD]: + timestep_suffix = str(timestep) + setter = macroscopic_values_setter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + setter_ast = create_kernel(setter, config=config) + setter_ast.function_name = 'kernel_initialise' + timestep_suffix + nodes.append(KernelCallNode(setter_ast)) + tree = EvenIntegerCondition('timestep', nodes[0], nodes[1], parameter_dtype=np.uint8) + family = KernelFamily(tree, class_name, field_timestep=get_timestep) + else: + timestep = Timestep.BOTH + setter = macroscopic_values_setter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + setter_ast = create_kernel(setter, config=config) + setter_ast.function_name = 'kernel_initialise' + node = KernelCallNode(setter_ast) + family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps) + + return family + + +def get_getter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: CreateKernelConfig): + density = macroscopic_fields.get('density', None) + velocity = macroscopic_fields.get('velocity', None) + + if density is None and velocity is None: + return None + + get_timestep = {"field_name": pdfs.name, "function": "getTimestep"} + temporary_fields = () + field_swaps = () + + if is_inplace(streaming_pattern): + nodes = list() 
+ for timestep in [Timestep.EVEN, Timestep.ODD]: + timestep_suffix = str(timestep) + getter = macroscopic_values_getter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + getter_ast = create_kernel(getter, config=config) + getter_ast.function_name = 'kernel_getter' + timestep_suffix + nodes.append(KernelCallNode(getter_ast)) + tree = EvenIntegerCondition('timestep', nodes[0], nodes[1], parameter_dtype=np.uint8) + family = KernelFamily(tree, class_name, field_timestep=get_timestep) + else: + timestep = Timestep.BOTH + getter = macroscopic_values_getter(lb_method, + density=density, velocity=velocity, pdfs=pdfs, + streaming_pattern=streaming_pattern, previous_timestep=timestep) + + getter_ast = create_kernel(getter, config=config) + getter_ast.function_name = 'kernel_getter' + node = KernelCallNode(getter_ast) + family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps) + + return family diff --git a/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5f49137846ba99d60888e7353ac4ff195ade2a84 --- /dev/null +++ b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h @@ -0,0 +1,108 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" + +{% if target is equalto 'gpu' -%} +#include "gpu/GPUWrapper.h" +{%- endif %} + +{% for include in includes -%} +#include {{include}} +{% endfor %} + + +namespace walberla{ +namespace {{namespace}} { + +template <typename FlagField_T> +class {{class_name}} +{ + public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + + {{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'pdfs'])]] | type_identifier_list -}} ) + : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_) + { + {% for object_name, boundary_class, kernel in zip(object_names, boundary_classes, kernel_list) -%} + + {{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'])]] | type_identifier_list -}}); + {% endfor %} + + {% for object_name, flag_uid in zip(object_names, flag_uids) -%} + {{object_name}}->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("{{flag_uid}}"), domainUID); + {% endfor %} + } + + void run ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {% for object_name in object_names -%} + {{object_name}}->run({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + {% endfor %} + } + + void 
inner ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {% for object_name in object_names -%} + {{object_name}}->inner({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + {% endfor %} + } + + void outer ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {% for object_name in object_names -%} + {{object_name}}->outer({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + {% endfor %} + } + + void operator() ({{- ["IBlock * block", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + run({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); + } + + std::function<void (IBlock *)> getSweep({{- ["Type type = Type::ALL", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](IBlock* block) { this->inner({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](IBlock* block) { this->outer({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); }; + default: + return [{{- ["this", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](IBlock* block) { this->run({{- ["block", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}); }; + } + } + + weak_ptr< StructuredBlockStorage > blocks_; + BlockDataID flagID; + BlockDataID pdfsID; + walberla::FlagUID domainUID; + + {% for object_name, boundary_class in zip(object_names, boundary_classes) -%} + shared_ptr<{{boundary_class}}> {{object_name}}; + {% endfor %} +}; + +} +} + diff --git 
a/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp b/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp index dd50337e1714c8abcf49c35b0d77e2e23d4d9c29..17d5bdeb4b5e7443958f9619d08848ae817b9a89 100644 --- a/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp +++ b/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp @@ -13,7 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \\author Martin Bauer <martin.bauer@fau.de> +//! \\file {{class_name}}.cpp +//! \\author lbmpy //====================================================================================================================== #include <cmath> diff --git a/python/lbmpy_walberla/templates/LatticeModel.tmpl.h b/python/lbmpy_walberla/templates/LatticeModel.tmpl.h index 677be50025122939d50b10eb7d1381afe519eb4e..5631eec3250d2c1e99d9a59e268e5e1794520757 100644 --- a/python/lbmpy_walberla/templates/LatticeModel.tmpl.h +++ b/python/lbmpy_walberla/templates/LatticeModel.tmpl.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \\author Martin Bauer <martin.bauer@fau.de> -// +//! \\file {{class_name}}.h +//! 
\\author lbmpy //====================================================================================================================== #pragma once diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..91c7d7d960a78552628d3d8568dd611f13c14a2d --- /dev/null +++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp @@ -0,0 +1,180 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author lbmpy +//====================================================================================================================== + +#include "{{class_name}}.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +{{ kernels['packAll'] | generate_definitions }} +{{ kernels['unpackAll'] | generate_definitions }} +{{ kernels['localCopyAll'] | generate_definitions }} + +{{ kernels['packDirection'] | generate_definitions }} +{{ kernels['unpackDirection'] | generate_definitions }} +{{ kernels['localCopyDirection'] | generate_definitions }} + +{% if nonuniform -%} +{{ kernels['unpackRedistribute'] | generate_definitions }} +{{ kernels['packPartialCoalescence'] | generate_definitions }} +{{ kernels['zeroCoalescenceRegion'] | generate_definitions }} +{{ kernels['unpackCoalescence'] | generate_definitions }} +{%- endif %} + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace {{namespace}} { + + void {{class_name}}::PackKernels::packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + {{kernels['packAll'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + + 
void {{class_name}}::PackKernels::unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackAll'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + + void {{class_name}}::PackKernels::localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyAll'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(6) }} + } + + void {{class_name}}::PackKernels::packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + {{kernels['packDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackDirection'] | generate_call(cell_interval='ci', 
stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyDirection'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(6) }} + } + + {% if nonuniform -%} + void {{class_name}}::PackKernels::unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackRedistribute'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + {{kernels['packPartialCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list 
-}} + ) const + { + {{kernels['zeroCoalescenceRegion'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + + void {{class_name}}::PackKernels::unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} + ) const + { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + {{kernels['unpackCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }} + } + {%- endif %} +} // namespace {{namespace}} +} // namespace walberla \ No newline at end of file diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5d5409b6684703666d5066ffa9ea40530a48f07d --- /dev/null +++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h @@ -0,0 +1,256 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" + +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" + +#include "stencil/{{stencil_name}}.h" +#include "stencil/Directions.h" + +{% if target is equalto 'cpu' -%} +#define FUNC_PREFIX +{%- elif target is equalto 'gpu' -%} +#define FUNC_PREFIX __global__ +#include "gpu/GPUWrapper.h" +#include "gpu/GPUField.h" +{%- endif %} + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla +{ +namespace {{namespace}}{ + +class {{class_name}} +{ + public: + // Used lattice stencil + using Stencil = stencil::{{stencil_name}}; + // Lattice stencil used for the communication (should be used to define which block directions need to be communicated) + using CommunicationStencil = stencil::{{communication_stencil_name}}; + // If false used correction: Lattice Boltzmann Model for the Incompressible Navier–Stokes Equation, He 1997 + static const bool compressible = {% if compressible %}true{% else %}false{% endif %}; + // Cut off for the lattice Boltzmann equilibrium + static const int equilibriumAccuracyOrder = {{equilibrium_accuracy_order}}; + // If true the equilibrium is computed in regard to "delta_rho" and not the actual density "rho" + static const bool equilibriumDeviationOnly = {% if equilibrium_deviation_only -%} true {%- else -%} false {%- endif -%}; + // If streaming pattern is inplace (esotwist, aa, ...) 
or not (pull, push) + static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%}; + // If true the background deviation (rho_0 = 1) is subtracted for the collision step. + static const bool zeroCenteredPDFs = {% if zero_centered -%} true {%- else -%} false {%- endif -%}; + // Lattice weights + static constexpr {{dtype}} w[{{stencil_size}}] = { {{weights}} }; + // Inverse lattice weights + static constexpr {{dtype}} wInv[{{stencil_size}}] = { {{inverse_weights}} }; + + // Compute kernels to pack and unpack MPI buffers + class PackKernels { + + public: + using PdfField_T = {{src_field | field_type(is_gpu=is_gpu)}}; + using value_type = typename PdfField_T::value_type; + + {% if nonuniform -%} + {% if target is equalto 'cpu' -%} + using MaskField_T = GhostLayerField< uint32_t, 1 >; + {%- elif target is equalto 'gpu' -%} + using MaskField_T = gpu::GPUField< uint32_t >; + {%- endif %} + {%- endif %} + + static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%}; + + /** + * Packs all pdfs from the given cell interval to the send buffer. + * */ + void packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. + * */ + void unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. 
+ * */ + void localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. + * */ + void packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + * */ + void unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval onto the receiving interval. 
+ * */ + void localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * {{stencil_size}} * sizeof(value_type); + } + + {% if nonuniform -%} + + /** + * Unpacks and uniformly redistributes populations coming from a coarse block onto the fine grid. 
+ * */ + void unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackRedistribute'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Partially coalesces and packs populations streaming from a fine block into a coarse block + * */ + void packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Prepares a coarse block for coalescence by setting every population that must be coalesced from fine blocks to zero. + * */ + void zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks and coalesces populations coming from a fine block onto the fine grid + * */ + void unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be unpacked to the cell interval + * when using unpackRedistribute. This is 2^{-d} of the data that would be + * unpacked during same-level communication. + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t redistributeSize(CellInterval & ci) const { + return size(ci) >> {{dimension}}; + } + + /** + * Returns the number of bytes that will be packed from the cell interval + * when using packPartialCoalescence. 
+ * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t partialCoalescenceSize(CellInterval & ci, stencil::Direction dir) const { + return size(ci, dir) >> {{dimension}}; + } + + {%- endif %} + + private: + const uint_t sizes[{{direction_sizes|length}}] { {{ direction_sizes | join(', ') }} }; + }; + +}; + +}} //{{namespace}}/walberla \ No newline at end of file diff --git a/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.cpp b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..558dc1faa53a56c281b4cf106f7eb643d0ae9dae --- /dev/null +++ b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.cpp @@ -0,0 +1,54 @@ +{% extends "PackingKernels.tmpl.cpp" %} + +{% block AdditionalKernelDefinitions %} +{{ kernels['unpackRedistribute'] | generate_definitions }} +{{ kernels['packPartialCoalescence'] | generate_definitions }} +{{ kernels['zeroCoalescenceRegion'] | generate_definitions }} +{{ kernels['unpackCoalescence'] | generate_definitions }} +{% endblock %} + +{% block AdditionalDefinitions %} + +void {{class_name}}::unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackRedistribute'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = 
reinterpret_cast<{{dtype}}*>(outBuffer); + + {{kernels['packPartialCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{kernels['zeroCoalescenceRegion'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackCoalescence'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +{% endblock %} diff --git a/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.h b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..01c99a69577f7928d830d743f9d0aba0b8584ea9 --- /dev/null +++ b/python/lbmpy_walberla/templates/NonuniformPackingKernels.tmpl.h @@ -0,0 +1,74 @@ +{% extends "PackingKernels.tmpl.h" %} + +{% block AdditionalPublicDeclarations %} +{% if target is equalto 'cpu' -%} + using MaskField_T = GhostLayerField< uint32_t, 1 >; +{%- elif target is equalto 'gpu' -%} + using MaskField_T = gpu::GPUField< uint32_t >; +{%- endif %} + + + /** + * Unpacks and uniformly redistributes populations coming from a coarse block onto + * the fine grid. 
+ */ + void unpackRedistribute( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackRedistribute'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Partially coalesces and packs populations streaming from a fine block into a coarse block + */ + void packPartialCoalescence( + {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packPartialCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Prepares a coarse block for coalescence by setting every population that must be coalesced from fine blocks + * to zero. + */ + void zeroCoalescenceRegion( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + kernels['zeroCoalescenceRegion'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks and coalesces populations coming from a fine block onto the fine grid + */ + void unpackCoalescence( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackCoalescence'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be unpacked to the cell interval + * when using unpackRedistribute. This is 2^{-d} of the data that would be + * unpacked during same-level communication. + * @param ci The cell interval + * @return The required size of the buffer, in bytes + */ + uint_t redistributeSize(CellInterval & ci) const { + return size(ci) >> {{dimension}}; + } + + /** + * Returns the number of bytes that will be packed from the cell interval + * when using packPartialCoalescence. 
+ * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + */ + uint_t partialCoalescenceSize(CellInterval & ci, stencil::Direction dir) const { + return size(ci, dir) >> {{dimension}}; + } +{% endblock %} diff --git a/python/lbmpy_walberla/templates/PackingKernels.tmpl.cpp b/python/lbmpy_walberla/templates/PackingKernels.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e536a9f8bbff9bf3fa3c2d907029112d9126053 --- /dev/null +++ b/python/lbmpy_walberla/templates/PackingKernels.tmpl.cpp @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author lbmpy +//====================================================================================================================== + +#include "{{class_name}}.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +//NOLINTBEGIN(readability-non-const-parameter*) +{{ kernels['packAll'] | generate_definitions }} +{{ kernels['unpackAll'] | generate_definitions }} +{{ kernels['localCopyAll'] | generate_definitions }} + +{{ kernels['packDirection'] | generate_definitions }} +{{ kernels['unpackDirection'] | generate_definitions }} +{{ kernels['localCopyDirection'] | generate_definitions }} + +{% block AdditionalKernelDefinitions %} +{% endblock %} +//NOLINTEND(readability-non-const-parameter*) + + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace {{namespace}} { + +void {{class_name}}::packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + + {{kernels['packAll'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + + +void {{class_name}}::unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", 
kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackAll'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + + +void {{class_name}}::localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyAll'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(3) }} +} + +void {{class_name}}::packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(outBuffer); + + {{kernels['packDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(inBuffer); + + {{kernels['unpackDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(3) }} +} + +void {{class_name}}::localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, 
"CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream"] if is_gpu else []] + | type_identifier_list -}} +) const { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + {{kernels['localCopyDirection'] + | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream') + | indent(3) }} +} + +{% block AdditionalDefinitions %} +{% endblock %} + +} // namespace {{namespace}} +} // namespace walberla \ No newline at end of file diff --git a/python/lbmpy_walberla/templates/PackingKernels.tmpl.h b/python/lbmpy_walberla/templates/PackingKernels.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5371e395d948e758efde91dafdb237fdb855aa9c --- /dev/null +++ b/python/lbmpy_walberla/templates/PackingKernels.tmpl.h @@ -0,0 +1,169 @@ +//====================================================================================================================== +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "stencil/Directions.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" +{% if target is equalto 'gpu' -%} +#include "gpu/GPUWrapper.h" +#include "gpu/GPUField.h" +{%- endif %} + +{% if target is equalto 'cpu' -%} +#define FUNC_PREFIX +{%- elif target is equalto 'gpu' -%} +#define FUNC_PREFIX __global__ +{%- endif %} + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#ifdef WALBERLA_CXX_COMPILER_IS_GNU +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +#ifdef WALBERLA_CXX_COMPILER_IS_CLANG +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla{ +namespace {{namespace}} { + +class {{class_name}} { + +public: + using PdfField_T = {{src_field | field_type(is_gpu=is_gpu)}}; + using value_type = typename PdfField_T::value_type; + + static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%}; + + /** + * Packs all pdfs from the given cell interval to the send buffer. + */ + void packAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. 
+ */ + void unpackAll( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. + */ + void localCopyAll( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyAll'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given + * cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. + */ + void packDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci", + "unsigned char * outBuffer", kernels['packDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given + * cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + */ + void unpackDirection( + {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci", + "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval + * onto the receiving interval. 
+ */ + void localCopyDirection( + {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval", + "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", + kernels['localCopyDirection'].kernel_selection_parameters, + ["gpuStream_t stream = nullptr"] if is_gpu else []] + | type_identifier_list -}} + ) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * {{stencil_size}} * sizeof(value_type); + } + + {% block AdditionalPublicDeclarations %} + {% endblock %} + + private: + const uint_t sizes[{{direction_sizes|length}}] { {{ direction_sizes | join(', ') }} }; +}; + +} // namespace {{namespace}} +} // namespace walberla diff --git a/python/lbmpy_walberla/utility.py b/python/lbmpy_walberla/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..1289c381e7b50ac7e83d34fca887e6d659959b92 --- /dev/null +++ b/python/lbmpy_walberla/utility.py @@ -0,0 +1,11 @@ +from lbmpy.advanced_streaming import Timestep + + +def timestep_suffix(timestep: Timestep): + """ get the suffix as string for a timestep + + :param timestep: instance of class lbmpy.advanced_streaming.Timestep + :return: either "even", "odd" or an empty string + """ + return ("_" + str(timestep)) if timestep != Timestep.BOTH else '' + diff --git a/python/lbmpy_walberla/walberla_lbm_generation.py 
b/python/lbmpy_walberla/walberla_lbm_generation.py index 8566d3915697e28600f54524d7d43e53a98c17b7..e264fb8bbbb8c67040de8c309e40e8b57c0f7053 100644 --- a/python/lbmpy_walberla/walberla_lbm_generation.py +++ b/python/lbmpy_walberla/walberla_lbm_generation.py @@ -1,4 +1,6 @@ # import warnings +from typing import Callable, List + import numpy as np import sympy as sp @@ -18,8 +20,10 @@ from pystencils.node_collection import NodeCollection from pystencils.stencil import offset_to_direction_string from pystencils.sympyextensions import get_symmetric_part from pystencils.typing.transformations import add_types -from pystencils_walberla.codegen import KernelInfo, config_from_context + +from pystencils_walberla.kernel_info import KernelInfo from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.utility import config_from_context cpp_printer = CustomSympyPrinter() REFINEMENT_SCALE_FACTOR = sp.Symbol("level_scale_factor") @@ -155,7 +159,7 @@ def __lattice_model(generation_context, class_name, config, lb_method, stream_co generation_context.write_file(f"{class_name}.cpp", source) -def generate_lattice_model(generation_context, class_name, collision_rule, field_layout='zyxf', refinement_scaling=None, +def generate_lattice_model(generation_context, class_name, collision_rule, field_layout='fzyx', refinement_scaling=None, target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, **create_kernel_params): diff --git a/python/lbmpy_walberla/walberla_lbm_package.py b/python/lbmpy_walberla/walberla_lbm_package.py new file mode 100644 index 0000000000000000000000000000000000000000..e21d6c9613a1c2be87e21cbc06a2a78212f72552 --- /dev/null +++ b/python/lbmpy_walberla/walberla_lbm_package.py @@ -0,0 +1,53 @@ +from typing import Callable, List, Dict + +from pystencils import Target, Field + +from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation +from lbmpy.relaxationrates import 
get_shear_relaxation_rate + +from pystencils_walberla.cmake_integration import CodeGenerationContext + +from lbmpy_walberla.boundary_collection import generate_boundary_collection +from lbmpy_walberla.storage_specification import generate_lbm_storage_specification +from lbmpy_walberla.sweep_collection import generate_lbm_sweep_collection, RefinementScaling + + +def generate_lbm_package(ctx: CodeGenerationContext, name: str, + collision_rule: LbmCollisionRule, + lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation, + nonuniform: bool = False, boundaries: List[Callable] = None, + macroscopic_fields: Dict[str, Field] = None, + target: Target = Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **kernel_parameters): + + if macroscopic_fields is None: + macroscopic_fields = {} + + method = collision_rule.method + + storage_spec_name = f'{name}StorageSpecification' + generate_lbm_storage_specification(ctx, storage_spec_name, method, lbm_config, + nonuniform=nonuniform, target=target, data_type=data_type) + + if nonuniform: + omega = get_shear_relaxation_rate(method) + refinement_scaling = RefinementScaling() + refinement_scaling.add_standard_relaxation_rate_scaling(omega) + else: + refinement_scaling = None + + streaming_pattern = lbm_config.streaming_pattern + generate_lbm_sweep_collection(ctx, f'{name}SweepCollection', collision_rule, + streaming_pattern=streaming_pattern, + field_layout=lbm_optimisation.field_layout, + refinement_scaling=refinement_scaling, + macroscopic_fields=macroscopic_fields, + target=target, data_type=data_type, + cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, + max_threads=max_threads, + **kernel_parameters) + + generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries, + lb_method=method, streaming_pattern=streaming_pattern, + target=target, layout=lbm_optimisation.field_layout) diff --git a/python/pystencils_walberla/__init__.py 
b/python/pystencils_walberla/__init__.py index 0ea2d02cb4b93fc880f0addc38058e0363e39c8c..f78f7fcf244e7fd140cd2abcc93ebaebaea2f94f 100644 --- a/python/pystencils_walberla/__init__.py +++ b/python/pystencils_walberla/__init__.py @@ -1,13 +1,17 @@ from .boundary import generate_staggered_boundary, generate_staggered_flux_boundary -from .cmake_integration import CodeGeneration -from .codegen import ( - generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel, - generate_mpidtype_info_from_kernel, generate_sweep, get_vectorize_instruction_set, generate_selective_sweep, - config_from_context) -from .utility import generate_info_header +from .cmake_integration import CodeGeneration, ManualCodeGenerationContext -__all__ = ['CodeGeneration', - 'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info', - 'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary', - 'get_vectorize_instruction_set', 'generate_selective_sweep', 'config_from_context', - 'generate_info_header'] +from .function_generator import function_generator +from .kernel_info import KernelInfo +from .sweep import generate_sweep, generate_selective_sweep, generate_sweep_collection +from .pack_info import (generate_pack_info, generate_pack_info_for_field, + generate_pack_info_from_kernel, generate_mpidtype_info_from_kernel) +from .utility import generate_info_header, get_vectorize_instruction_set, config_from_context + +__all__ = ['generate_staggered_boundary', 'generate_staggered_flux_boundary', + 'CodeGeneration', 'ManualCodeGenerationContext', + 'function_generator', + 'generate_sweep', 'generate_selective_sweep', 'generate_sweep_collection', + 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel', + 'generate_mpidtype_info_from_kernel', + 'generate_info_header', 'get_vectorize_instruction_set', 'config_from_context'] diff --git 
a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py index 329b6805984e24fcdc4e4d17205ec55aa29cff06..c5a5e54c1d00d9d6e476306453eae4320b6f5aa8 100644 --- a/python/pystencils_walberla/boundary.py +++ b/python/pystencils_walberla/boundary.py @@ -2,13 +2,10 @@ import numpy as np from jinja2 import Environment, PackageLoader, StrictUndefined from pystencils import Field, FieldType, Target from pystencils.boundaries.boundaryhandling import create_boundary_kernel -from pystencils.boundaries.createindexlist import ( - boundary_index_array_coordinate_names, direction_member_name, - numpy_data_type_for_boundary_object) +from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object from pystencils.typing import TypedSymbol, create_type -from pystencils.stencil import inverse_direction -from pystencils_walberla.codegen import config_from_context +from pystencils_walberla.utility import config_from_context, struct_from_numpy_dtype from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env from pystencils_walberla.additional_data_handler import AdditionalDataHandler from pystencils_walberla.kernel_selection import ( @@ -32,6 +29,7 @@ def generate_boundary(generation_context, interface_mappings=(), generate_functor=True, layout='fzyx', + field_timestep=None, **create_kernel_params): if boundary_object.additional_data and additional_data_handler is None: @@ -75,8 +73,9 @@ def generate_boundary(generation_context, else: raise ValueError(f"kernel_creation_function returned wrong type: {kernel.__class__}") - kernel_family = KernelFamily(selection_tree, class_name) - interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) + kernel_family = KernelFamily(selection_tree, class_name, field_timestep=field_timestep) + selection_parameters = kernel_family.kernel_selection_parameters if field_timestep is None else [] + interface_spec = 
HighLevelInterfaceSpec(selection_parameters, interface_mappings) if additional_data_handler is None: additional_data_handler = AdditionalDataHandler(stencil=neighbor_stencil) @@ -98,7 +97,8 @@ def generate_boundary(generation_context, 'single_link': boundary_object.single_link, 'additional_data_handler': additional_data_handler, 'dtype': "double" if is_float else "float", - 'layout': layout + 'layout': layout, + 'index_shape': index_shape } env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) @@ -107,10 +107,12 @@ def generate_boundary(generation_context, header = env.get_template('Boundary.tmpl.h').render(**context) source = env.get_template('Boundary.tmpl.cpp').render(**context) - source_extension = "cpp" if target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) + return context + def generate_staggered_boundary(generation_context, class_name, boundary_object, dim, neighbor_stencil, index_shape, target=Target.CPU, **kwargs): @@ -126,28 +128,3 @@ def generate_staggered_flux_boundary(generation_context, class_name, boundary_ob FieldType.STAGGERED_FLUX, target=target, **kwargs) -def struct_from_numpy_dtype(struct_name, numpy_dtype): - result = f"struct {struct_name} {{ \n" - - equality_compare = [] - constructor_params = [] - constructor_initializer_list = [] - for name, (sub_type, offset) in numpy_dtype.fields.items(): - pystencils_type = create_type(sub_type) - result += f" {pystencils_type} {name};\n" - if name in boundary_index_array_coordinate_names or name == direction_member_name: - constructor_params.append(f"{pystencils_type} {name}_") - constructor_initializer_list.append(f"{name}({name}_)") - else: - constructor_initializer_list.append(f"{name}()") - if pystencils_type.is_float(): - 
equality_compare.append(f"floatIsEqual({name}, o.{name})") - else: - equality_compare.append(f"{name} == o.{name}") - - result += " %s(%s) : %s {}\n" % \ - (struct_name, ", ".join(constructor_params), ", ".join(constructor_initializer_list)) - result += " bool operator==(const %s & o) const {\n return %s;\n }\n" % \ - (struct_name, " && ".join(equality_compare)) - result += "};\n" - return result diff --git a/python/pystencils_walberla/cmake_integration.py b/python/pystencils_walberla/cmake_integration.py index 2656cac32f9939a65ee61f7fb74541355d279bb2..4d5654c08a1474b53852f643c2cf4249a12901db 100644 --- a/python/pystencils_walberla/cmake_integration.py +++ b/python/pystencils_walberla/cmake_integration.py @@ -20,6 +20,7 @@ DEFAULT_CMAKE_VARS = {'WALBERLA_BUILD_WITH_OPENMP': False, 'WALBERLA_DOUBLE_ACCURACY': True, 'WALBERLA_BUILD_WITH_MPI': True, 'WALBERLA_BUILD_WITH_CUDA': False, + 'WALBERLA_BUILD_WITH_HIP': False, "CODEGEN_CFG": ""} PARSE_HELPER = {"on": True, "1": True, "yes": True, "true": True, @@ -73,6 +74,8 @@ class CodeGenerationContext: self.mpi = cmake_vars['WALBERLA_BUILD_WITH_MPI'] self.double_accuracy = cmake_vars['WALBERLA_DOUBLE_ACCURACY'] self.cuda = cmake_vars['WALBERLA_BUILD_WITH_CUDA'] + self.hip = cmake_vars['WALBERLA_BUILD_WITH_HIP'] + self.gpu = self.cuda or self.hip self.config = cmake_vars['CODEGEN_CFG'].strip() def write_file(self, name, content): @@ -87,18 +90,27 @@ class ManualCodeGenerationContext: to constructor instead of getting them from CMake """ - def __init__(self, openmp=False, optimize_for_localhost=False, mpi=True, double_accuracy=True, cuda=False): + def __init__(self, openmp=False, optimize_for_localhost=False, mpi=True, double_accuracy=True, + cuda=False, hip=False): self.openmp = openmp self.optimize_for_localhost = optimize_for_localhost self.mpi = mpi self.double_accuracy = double_accuracy self.files = dict() self.cuda = cuda + self.hip = hip + self.gpu = self.cuda or self.hip self.config = "" def write_file(self, name, 
content): self.files[name] = content + def write_all_files(self): + for name, content in self.files.items(): + with open(name, 'w') as f: + f.write(content) + self.files = dict() + def __enter__(self): return self diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index c5cefc06ea23d0892804b251d21cedd6cd6d67e8..ac475f72c9489d9e7b74ce25d9bf303413ae7834 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -124,7 +124,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int elif target != kernel_family.get_ast_attr('target'): raise ValueError('Mismatch between target parameter and AST targets.') - if not generation_context.cuda and target == Target.GPU: + if not generation_context.gpu and target == Target.GPU: return representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} @@ -152,7 +152,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int header = env.get_template("Sweep.tmpl.h").render(**jinja_context) source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) - source_extension = "cpp" if target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) @@ -344,7 +344,7 @@ def generate_pack_info(generation_context, class_name: str, header = env.get_template(template_name + ".h").render(**jinja_context) source = env.get_template(template_name + ".cpp").render(**jinja_context) - source_extension = "cpp" if config.target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) @@ -446,14 +446,16 @@ class 
KernelInfo: indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = CudaSympyPrinter() + + block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) + grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) + + kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" + kernel_call_lines = [ - "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['block']), - "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['grid']), - "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, - stream, call_parameters), - ] + f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));", + f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));", + kernel_launch] return "\n".join(kernel_call_lines) else: @@ -477,9 +479,9 @@ def get_vectorize_instruction_set(generation_context): def config_from_context(generation_context, target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, **kwargs): - if target == Target.GPU and not generation_context.cuda: - raise ValueError("can not generate cuda code if waLBerla is not build with CUDA. Please use " - "-DWALBERLA_BUILD_WITH_CUDA=1 for configuring cmake") + if target == Target.GPU and not generation_context.gpu: + raise ValueError("can not generate device code if waLBerla is not build with CUDA or HIP. 
Please use " + "-DWALBERLA_BUILD_WITH_CUDA=1 or -DWALBERLA_BUILD_WITH_HIP=1 for configuring cmake") default_dtype = "float64" if generation_context.double_accuracy else "float32" if data_type is None: diff --git a/python/pystencils_walberla/function_generator.py b/python/pystencils_walberla/function_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7b60803c59d788ce615ceeeb314b5ece0acbe1 --- /dev/null +++ b/python/pystencils_walberla/function_generator.py @@ -0,0 +1,77 @@ +from typing import Sequence, Union + + +from pystencils import Target, Assignment, AssignmentCollection +from pystencils import create_kernel, create_staggered_kernel + +from pystencils_walberla.cmake_integration import CodeGenerationContext +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec +from pystencils_walberla.utility import config_from_context + + +def function_generator(ctx: CodeGenerationContext, class_name: str, + assignments: Union[Sequence[Assignment], AssignmentCollection], + namespace: str = 'pystencils', staggered=False, field_swaps=None, varying_parameters=(), + ghost_layers_to_include=0, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **create_kernel_params): + return lambda: __function_generator(ctx, class_name, assignments, + namespace, staggered, field_swaps, varying_parameters, + ghost_layers_to_include, + target, data_type, cpu_openmp, cpu_vectorize_info, max_threads, + **create_kernel_params) + + +def __function_generator(ctx: CodeGenerationContext, class_name: str, + assignments: Union[Sequence[Assignment], AssignmentCollection], + namespace: str = 'pystencils', staggered=False, field_swaps=None, varying_parameters=(), + ghost_layers_to_include=0, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + max_threads=None, + **create_kernel_params): + if staggered: + assert 'omp_single_loop' not in 
create_kernel_params + + create_kernel_params['omp_single_loop'] = False + config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) + + if not staggered: + ast = create_kernel(assignments, config=config) + else: + # This should not be necessary but create_staggered_kernel does not take a config at the moment ... + ast = create_staggered_kernel(assignments, **config.__dict__) + + ast.function_name = class_name.lower() + + all_field_names = [f.name for f in ast.fields_accessed] + all_field_names.sort() + + temporary_fields = [f for f in all_field_names if "_tmp" in f] + + if field_swaps is None: + field_swaps = [] + for field_name in all_field_names: + if field_name + "_tmp" in temporary_fields: + field_swaps.append((field_name, field_name + "_tmp")) + + selection_tree = KernelCallNode(ast) + kernel_family = KernelFamily(selection_tree, class_name, + temporary_fields, field_swaps, varying_parameters) + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] + + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, ()) + + jinja_context = { + 'kernel': kernel_family, + 'namespace': namespace, + 'function_name': class_name, + 'field': representative_field, + 'ghost_layers_to_include': ghost_layers_to_include, + 'interface_spec': interface_spec, + 'max_threads': max_threads + } + return jinja_context diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index 0ee3f3cd128e2b9e9f7c90cbb685457022f4529f..b2413bcefe8c4a4468e4971c1b8901116519c57d 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -1,11 +1,10 @@ -import jinja2 - # For backward compatibility with version < 3.0.0 try: from jinja2 import pass_context as jinja2_context_decorator 
except ImportError: from jinja2 import contextfilter as jinja2_context_decorator +from collections.abc import Iterable import sympy as sp from pystencils import Target, Backend @@ -45,6 +44,18 @@ delete_loop = """ }} """ +standard_parameter_registration = """ +for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++) +{{ + const {dtype} level_scale_factor = {dtype}(uint_t(1) << level); + const {dtype} one = {dtype}(1.0); + const {dtype} half = {dtype}(0.5); + + {name}Vector.push_back( {dtype}({name} / (level_scale_factor * (-{name} * half + one) + {name} * half)) ); +}} +""" + + # the target will enter the jinja filters as string. The reason for that is, that is not easy to work with the # enum in the template files. def translate_target(target): @@ -58,11 +69,17 @@ def translate_target(target): def make_field_type(dtype, f_size, is_gpu): if is_gpu: - return f"cuda::GPUField<{dtype}>" + return f"gpu::GPUField<{dtype}>" else: return f"field::GhostLayerField<{dtype}, {f_size}>" +def field_type(field, is_gpu=False): + dtype = get_base_type(field.dtype) + f_size = get_field_fsize(field) + return make_field_type(dtype, f_size, is_gpu) + + def get_field_fsize(field): """Determines the size of the index coordinate. Since walberla fields only support one index dimension, pystencils fields with multiple index dimensions are linearized to a single index dimension. 
@@ -149,35 +166,30 @@ def field_extraction_code(field, is_temporary, declaration_only=False, is_gpu: if the field is a GhostLayerField or a GpuField update_member: specify if function is used inside a constructor; add _ to members """ - # Determine size of f coordinate which is a template parameter - f_size = get_field_fsize(field) - field_name = field.name - dtype = get_base_type(field.dtype) - field_type = make_field_type(dtype, f_size, is_gpu) + wlb_field_type = field_type(field, is_gpu) if not is_temporary: - dtype = get_base_type(field.dtype) - field_type = make_field_type(dtype, f_size, is_gpu) if declaration_only: - return f"{field_type} * {field_name}_;" + return f"{wlb_field_type} * {field.name}_;" else: prefix = "" if no_declaration else "auto " if update_member: - return f"{prefix}{field_name}_ = block->getData< {field_type} >({field_name}ID);" + return f"{prefix}{field.name}_ = block->getData< {wlb_field_type} >({field.name}ID);" else: - return f"{prefix}{field_name} = block->getData< {field_type} >({field_name}ID);" + return f"{prefix}{field.name} = block->getData< {wlb_field_type} >({field.name}ID);" else: - assert field_name.endswith('_tmp') - original_field_name = field_name[:-len('_tmp')] + assert field.name.endswith('_tmp') + original_field_name = field.name[:-len('_tmp')] if declaration_only: - return f"{field_type} * {field_name}_;" + return f"{wlb_field_type} * {field.name}_;" else: - declaration = f"{field_type} * {field_name};" + declaration = f"{wlb_field_type} * {field.name};" tmp_field_str = temporary_fieldTemplate.format(original_field_name=original_field_name, - tmp_field_name=field_name, type=field_type) + tmp_field_name=field.name, type=wlb_field_type) return tmp_field_str if no_declaration else declaration + tmp_field_str +# TODO fields are not sorted @jinja2_context_decorator def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_ignore=(), parameters=None, declarations_only=False, no_declarations=False, 
update_member=False): @@ -213,11 +225,22 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno return result -def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore=(), ignore_fields=False): +def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore=(), ignore_fields=False, + parameter_registration=None): symbols = {p.field_name for p in kernel_info.parameters if p.is_field_pointer and not ignore_fields} symbols.update(p.symbol.name for p in kernel_info.parameters if not p.is_field_parameter) symbols.difference_update(parameters_to_ignore) - return "\n".join("auto & %s = %s%s_;" % (s, prefix, s) for s in symbols) + type_information = {p.symbol.name: p.symbol.dtype for p in kernel_info.parameters if not p.is_field_parameter} + result = [] + registered_parameters = [] if not parameter_registration else parameter_registration.scaling_info + for s in symbols: + if s in registered_parameters: + dtype = type_information[s].c_name + result.append("const uint_t level = block->getBlockStorage().getLevel(*block);") + result.append(f"{dtype} & {s} = {s}Vector[level];") + else: + result.append(f"auto & {s} = {prefix}{s}_;") + return "\n".join(result) @jinja2_context_decorator @@ -236,8 +259,8 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st cell_interval: Defines the name (string) of a walberla CellInterval object in scope, that defines the inner region for the kernel to loop over. Parameter has to be left to default if ghost_layers_to_include is specified. - stream: optional name of cuda stream variable - spatial_shape_symbols: relevant only for gpu kernels - to determine CUDA block and grid sizes the iteration + stream: optional name of gpu stream variable + spatial_shape_symbols: relevant only for gpu kernels - to determine GPU block and grid sizes the iteration region (i.e. field shape) has to be known. 
This can normally be inferred by the kernel parameters - however in special cases like boundary conditions a manual specification may be necessary. @@ -262,33 +285,38 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st required_ghost_layers = 0 else: # ghost layer info is ((x_gl_front, x_gl_end), (y_gl_front, y_gl_end).. ) - required_ghost_layers = max(max(kernel_ghost_layers)) + if isinstance(kernel_ghost_layers, int): + required_ghost_layers = kernel_ghost_layers + else: + required_ghost_layers = max(max(kernel_ghost_layers)) kernel_call_lines = [] + def get_cell_interval(field_object): + if isinstance(cell_interval, str): + return cell_interval + elif isinstance(cell_interval, dict): + return cell_interval[field_object] + else: + return None + def get_start_coordinates(field_object): - if cell_interval is None: + ci = get_cell_interval(field_object) + if ci is None: return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions else: assert ghost_layers_to_include == 0 - if field_object.spatial_dimensions == 3: - return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord_name, ci=cell_interval)) - required_ghost_layers - for coord_name in ('x', 'y', 'z')] - elif field_object.spatial_dimensions == 2: - return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord_name, ci=cell_interval)) - required_ghost_layers - for coord_name in ('x', 'y')] - else: - raise NotImplementedError(f"Only 2D and 3D fields are supported but a field with " - f"{field_object.spatial_dimensions} dimensions was passed") + return [sp.Symbol(f"{ci}.{coord_name}Min()") - required_ghost_layers for coord_name in ('x', 'y', 'z')] def get_end_coordinates(field_object): - if cell_interval is None: + ci = get_cell_interval(field_object) + if ci is None: shape_names = ['xSize()', 'ySize()', 'zSize()'][:field_object.spatial_dimensions] offset = 2 * ghost_layers_to_include + 2 * required_ghost_layers - return 
[f"cell_idx_c({field_object.name}->{e}) + {offset}" for e in shape_names] + return [f"int64_c({field_object.name}->{e}) + {offset}" for e in shape_names] else: assert ghost_layers_to_include == 0 - return [f"cell_idx_c({cell_interval}.{coord_name}Size()) + {2 * required_ghost_layers}" + return [f"int64_c({ci}.{coord_name}Size()) + {2 * required_ghost_layers}" for coord_name in ('x', 'y', 'z')] for param in ast_params: @@ -305,21 +333,21 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st coord_set = set(coordinates) coord_set = sorted(coord_set, key=lambda e: str(e)) for c in coord_set: - kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({c}, -{actual_gls});") + kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({c}, -{actual_gls})") while len(coordinates) < 4: coordinates.append(0) coordinates = tuple(coordinates) kernel_call_lines.append(f"{param.symbol.dtype} {param.symbol.name} = {param.field_name}->dataAt" f"({coordinates[0]}, {coordinates[1]}, {coordinates[2]}, {coordinates[3]});") if assume_inner_stride_one and field.index_dimensions > 0: - kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx);") + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx)") if instruction_set and assume_aligned: if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['cachelineSize']}, 0);") + f"{instruction_set['cachelineSize']}, 0)") else: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['bytes']}, 0);") + f"{instruction_set['bytes']}, 0)") elif param.is_field_stride: casted_stride = get_field_stride(param) type_str = param.symbol.dtype.c_name @@ -331,17 +359,17 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st shape = 
f"{type_str}({get_end_coordinates(field)[coord]})" assert coord < 3 max_value = f"{field.name}->{('x', 'y', 'z')[coord]}SizeWithGhostLayer()" - kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, {shape});") + kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, {shape})") kernel_call_lines.append(f"const {type_str} {param.symbol.name} = {shape};") if assume_inner_stride_one and field.index_dimensions > 0: - kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx);") + kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx)") if instruction_set and assume_aligned: if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['cachelineSize']}, 0);") + f"{instruction_set['cachelineSize']}, 0)") else: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{instruction_set['bytes']}, 0);") + f"{instruction_set['bytes']}, 0)") kernel_call_lines.append(kernel.generate_kernel_invocation_code(stream=stream, spatial_shape_symbols=spatial_shape_symbols)) @@ -349,6 +377,38 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st return "\n".join(kernel_call_lines) +@jinja2_context_decorator +def generate_function_collection_call(ctx, kernel_info, parameters_to_ignore=(), cell_interval=None, ghost_layers=None): + target = translate_target(ctx['target']) + is_gpu = target == Target.GPU + + parameters = [] + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_ignore: + parameters.append(param.field_name) + + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: + parameters.append(param.symbol.name) + + # TODO due to backward compatibility with high level interface spec 
+ for parameter in kernel_info.kernel_selection_tree.get_selection_parameter_list(): + if parameter.name not in parameters_to_ignore: + parameters.append(parameter.name) + + if cell_interval: + assert ghost_layers is None, "If a cell interval is specified ghost layers can not be specified" + parameters.append(cell_interval) + + if ghost_layers: + parameters.append(ghost_layers) + + if is_gpu and "gpuStream" not in parameters_to_ignore: + parameters.append(f"gpuStream") + + return ", ".join(parameters) + + def generate_swaps(kernel_info): """Generates code to swap main fields with temporary fields""" swaps = "" @@ -357,115 +417,229 @@ def generate_swaps(kernel_info): return swaps -def generate_constructor_initializer_list(kernel_info, parameters_to_ignore=None): - if parameters_to_ignore is None: - parameters_to_ignore = [] +def generate_timestep_advancements(kernel_info, advance=True): + """Generates code to detect even or odd timestep""" + if kernel_info.field_timestep: + field_name = kernel_info.field_timestep["field_name"] + advancement_function = kernel_info.field_timestep["function"] + if advancement_function == "advanceTimestep" and advance is False: + advancement_function = "getTimestepPlusOne" + return f"uint8_t timestep = {field_name}->{advancement_function}();" + return "" + - parameters_to_ignore += kernel_info.temporary_fields +def generate_constructor_initializer_list(kernel_infos, parameters_to_ignore=None, parameter_registration=None): + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + parameters_to_skip = [] + if parameters_to_ignore is not None: + parameters_to_skip = [p for p in parameters_to_ignore] + + for kernel_info in kernel_infos: + parameters_to_skip += kernel_info.temporary_fields parameter_initializer_list = [] # First field pointer - for param in kernel_info.parameters: - if param.is_field_pointer and param.field_name not in parameters_to_ignore: - 
parameter_initializer_list.append(f"{param.field_name}ID({param.field_name}ID_)") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_skip: + parameter_initializer_list.append(f"{param.field_name}ID({param.field_name}ID_)") + parameters_to_skip.append(param.field_name) # Then free parameters - for param in kernel_info.parameters: - if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: - parameter_initializer_list.append(f"{param.symbol.name}_({param.symbol.name})") + if parameter_registration is not None: + parameters_to_skip.extend(parameter_registration.scaling_info) + + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in parameters_to_skip: + parameter_initializer_list.append(f"{param.symbol.name}_({param.symbol.name})") + parameters_to_skip.append(param.symbol.name) return ", ".join(parameter_initializer_list) -def generate_constructor_parameters(kernel_info, parameters_to_ignore=None): - if parameters_to_ignore is None: - parameters_to_ignore = [] +# TODO check varying_parameters +def generate_constructor_parameters(kernel_infos, parameters_to_ignore=None): + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + parameters_to_skip = [] + if parameters_to_ignore is not None: + parameters_to_skip = [p for p in parameters_to_ignore] varying_parameters = [] - if hasattr(kernel_info, 'varying_parameters'): - varying_parameters = kernel_info.varying_parameters - varying_parameter_names = tuple(e[1] for e in varying_parameters) - parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names + for kernel_info in kernel_infos: + if hasattr(kernel_info, 'varying_parameters'): + varying_parameters = kernel_info.varying_parameters + varying_parameter_names = tuple(e[1] for e in varying_parameters) + parameters_to_skip += 
kernel_info.temporary_fields + varying_parameter_names parameter_list = [] # First field pointer - for param in kernel_info.parameters: - if param.is_field_pointer and param.field_name not in parameters_to_ignore: - parameter_list.append(f"BlockDataID {param.field_name}ID_") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_skip: + parameter_list.append(f"BlockDataID {param.field_name}ID_") + parameters_to_skip.append(param.field_name) # Then free parameters - for param in kernel_info.parameters: - if not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: - parameter_list.append(f"{param.symbol.dtype} {param.symbol.name}") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in parameters_to_skip: + parameter_list.append(f"{param.symbol.dtype} {param.symbol.name}") + parameters_to_skip.append(param.symbol.name) varying_parameters = ["%s %s" % e for e in varying_parameters] return ", ".join(parameter_list + varying_parameters) def generate_constructor_call_arguments(kernel_info, parameters_to_ignore=None): - if parameters_to_ignore is None: - parameters_to_ignore = [] + parameters_to_skip = [] + if parameters_to_ignore is not None: + parameters_to_skip = [p for p in parameters_to_ignore] varying_parameters = [] if hasattr(kernel_info, 'varying_parameters'): varying_parameters = kernel_info.varying_parameters varying_parameter_names = tuple(e[1] for e in varying_parameters) - parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names + parameters_to_skip += kernel_info.temporary_fields + varying_parameter_names parameter_list = [] for param in kernel_info.parameters: - if param.is_field_pointer and param.field_name not in parameters_to_ignore: + if param.is_field_pointer and param.field_name not in parameters_to_skip: 
parameter_list.append(f"{param.field_name}ID") - elif not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: + elif not param.is_field_parameter and param.symbol.name not in parameters_to_skip: parameter_list.append(f'{param.symbol.name}_') varying_parameters = [f"{e}_" for e in varying_parameter_names] return ", ".join(parameter_list + varying_parameters) @jinja2_context_decorator -def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=False): - fields = {f.name: f for f in kernel_info.fields_accessed} +def generate_members(ctx, kernel_infos, parameters_to_ignore=None, only_fields=False, parameter_registration=None): + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + if parameters_to_ignore is None: + parameters_to_ignore = [] + + params_to_skip = [p for p in parameters_to_ignore] + + fields = dict() + for kernel_info in kernel_infos: + for field in kernel_info.fields_accessed: + fields[field.name] = field + + varying_parameters = [] + for kernel_info in kernel_infos: + if hasattr(kernel_info, 'varying_parameters'): + varying_parameters = kernel_info.varying_parameters + varying_parameter_names = tuple(e[1] for e in varying_parameters) + params_to_skip += kernel_info.temporary_fields + params_to_skip += varying_parameter_names - params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields) - params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters) target = translate_target(ctx['target']) is_gpu = target == Target.GPU result = [] - for param in kernel_info.parameters: - if only_fields and not param.is_field_parameter: - continue - if param.is_field_pointer and param.field_name not in params_to_skip: - result.append(f"BlockDataID {param.field_name}ID;") + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if only_fields and not param.is_field_parameter: + continue + if param.is_field_pointer and param.field_name not in params_to_skip: 
+ result.append(f"BlockDataID {param.field_name}ID;") + params_to_skip.append(param.field_name) + + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if only_fields and not param.is_field_parameter: + continue + if not param.is_field_parameter and param.symbol.name not in params_to_skip: + if parameter_registration and param.symbol.name in parameter_registration.scaling_info: + result.append(f"std::vector<{param.symbol.dtype}> {param.symbol.name}Vector;") + else: + result.append(f"{param.symbol.dtype} {param.symbol.name}_;") + params_to_skip.append(param.symbol.name) + + for kernel_info in kernel_infos: + for field_name in kernel_info.temporary_fields: + f = fields[field_name] + if field_name in parameters_to_ignore: + continue + parameters_to_ignore.append(field_name) + assert field_name.endswith('_tmp') + original_field_name = field_name[:-len('_tmp')] + f_size = get_field_fsize(f) + field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu) + result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name)) + + for kernel_info in kernel_infos: + if hasattr(kernel_info, 'varying_parameters'): + result.extend(["%s %s_;" % e for e in kernel_info.varying_parameters]) + return "\n".join(result) + + +@jinja2_context_decorator +def generate_plain_parameter_list(ctx, kernel_info, cell_interval=None, ghost_layers=None, stream=None): + fields = {f.name: f for f in kernel_info.fields_accessed} + target = translate_target(ctx['target']) + is_gpu = target == Target.GPU + + result = [] for param in kernel_info.parameters: - if only_fields and not param.is_field_parameter: + if not param.is_field_parameter: continue - if not param.is_field_parameter and param.symbol.name not in params_to_skip: - result.append(f"{param.symbol.dtype} {param.symbol.name}_;") + if param.is_field_pointer and param.field_name: + f = fields[param.field_name] + f_size = get_field_fsize(f) + field_type = 
make_field_type(get_base_type(f.dtype), f_size, is_gpu) + result.append(f"{field_type} * {param.field_name}") - for field_name in kernel_info.temporary_fields: - f = fields[field_name] - if field_name in parameters_to_ignore: - continue - assert field_name.endswith('_tmp') - original_field_name = field_name[:-len('_tmp')] - f_size = get_field_fsize(f) - field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu) - result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name)) + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name: + result.append(f"{param.symbol.dtype} {param.symbol.name}") if hasattr(kernel_info, 'varying_parameters'): result.extend(["%s %s_;" % e for e in kernel_info.varying_parameters]) - return "\n".join(result) + # TODO due to backward compatibility with high level interface spec + for parameter in kernel_info.kernel_selection_tree.get_selection_parameter_list(): + result.append(f"{parameter.dtype} {parameter.name}") + + if cell_interval: + result.append(f"const CellInterval & {cell_interval}") + + if ghost_layers is not None: + if type(ghost_layers) in (int, ): + result.append(f"const cell_idx_t ghost_layers = {ghost_layers}") + else: + result.append(f"const cell_idx_t ghost_layers") + + if is_gpu: + if stream is not None: + result.append(f"gpuStream_t stream = {stream}") + else: + result.append(f"gpuStream_t stream") + + return ", ".join(result) -def generate_destructor(kernel_info, class_name): - if not kernel_info.temporary_fields: +def generate_destructor(kernel_infos, class_name): + temporary_fields = [] + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + for kernel_info in kernel_infos: + for tmp_field in kernel_info.temporary_fields: + if tmp_field not in temporary_fields: + temporary_fields.append(tmp_field) + + if not temporary_fields: return "" else: contents = "" - for field_name in 
kernel_info.temporary_fields: + for field_name in temporary_fields: contents += delete_loop.format(original_field_name=field_name[:-len('_tmp')]) return temporary_constructor.format(contents=contents, class_name=class_name) @@ -500,6 +674,47 @@ def nested_class_method_definition_prefix(ctx, nested_class_name): return f"{outer_class}::{nested_class_name}" +@jinja2_context_decorator +def generate_parameter_registration(ctx, kernel_infos, parameter_registration): + if parameter_registration is None: + return "" + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + params_to_skip = [] + result = [] + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in params_to_skip: + if param.symbol.name in parameter_registration.scaling_info: + result.append(standard_parameter_registration.format(dtype=param.symbol.dtype, + name=param.symbol.name)) + params_to_skip.append(param.symbol.name) + + return "\n".join(result) + + +@jinja2_context_decorator +def generate_constructor(ctx, kernel_infos, parameter_registration): + if parameter_registration is None: + return "" + if not isinstance(kernel_infos, Iterable): + kernel_infos = [kernel_infos] + + params_to_skip = [] + result = [] + for kernel_info in kernel_infos: + for param in kernel_info.parameters: + if not param.is_field_parameter and param.symbol.name not in params_to_skip: + if param.symbol.name in parameter_registration.scaling_info: + name = param.symbol.name + dtype = param.symbol.dtype + result.append(standard_parameter_registration.format(dtype=dtype, name=name)) + params_to_skip.append(name) + + return "\n".join(result) + + def generate_list_of_expressions(expressions, prepend=''): if len(expressions) == 0: return '' @@ -516,7 +731,7 @@ def type_identifier_list(nested_arg_list): def recursive_flatten(arg_list): for s in arg_list: - if isinstance(s, str): + if isinstance(s, str) and len(s) > 0: result.append(s) 
elif isinstance(s, TypedSymbol): result.append(f"{s.dtype} {s.name}") @@ -553,16 +768,22 @@ def add_pystencils_filters_to_jinja_env(jinja_env): jinja_env.filters['generate_definitions'] = generate_definitions jinja_env.filters['generate_declarations'] = generate_declarations jinja_env.filters['generate_members'] = generate_members + jinja_env.filters['generate_plain_parameter_list'] = generate_plain_parameter_list jinja_env.filters['generate_constructor_parameters'] = generate_constructor_parameters jinja_env.filters['generate_constructor_initializer_list'] = generate_constructor_initializer_list jinja_env.filters['generate_constructor_call_arguments'] = generate_constructor_call_arguments jinja_env.filters['generate_call'] = generate_call + jinja_env.filters['generate_function_collection_call'] = generate_function_collection_call jinja_env.filters['generate_block_data_to_field_extraction'] = generate_block_data_to_field_extraction + jinja_env.filters['generate_timestep_advancements'] = generate_timestep_advancements jinja_env.filters['generate_swaps'] = generate_swaps jinja_env.filters['generate_refs_for_kernel_parameters'] = generate_refs_for_kernel_parameters jinja_env.filters['generate_destructor'] = generate_destructor jinja_env.filters['generate_field_type'] = generate_field_type jinja_env.filters['nested_class_method_definition_prefix'] = nested_class_method_definition_prefix + jinja_env.filters['generate_parameter_registration'] = generate_parameter_registration + jinja_env.filters['generate_constructor'] = generate_constructor jinja_env.filters['type_identifier_list'] = type_identifier_list jinja_env.filters['identifier_list'] = identifier_list jinja_env.filters['list_of_expressions'] = generate_list_of_expressions + jinja_env.filters['field_type'] = field_type diff --git a/python/pystencils_walberla/kernel_info.py b/python/pystencils_walberla/kernel_info.py new file mode 100644 index 
0000000000000000000000000000000000000000..1382d94f4220495da28bf02113636fdf8addbaf1 --- /dev/null +++ b/python/pystencils_walberla/kernel_info.py @@ -0,0 +1,67 @@ +from functools import reduce + +from pystencils import Target + +from pystencils.backends.cbackend import get_headers +from pystencils.backends.cuda_backend import CudaSympyPrinter +from pystencils.typing.typed_sympy import SHAPE_DTYPE +from pystencils.typing import TypedSymbol + +from pystencils_walberla.utility import merge_sorted_lists + + +# TODO KernelInfo and KernelFamily should have same interface +class KernelInfo: + def __init__(self, ast, temporary_fields=(), field_swaps=(), varying_parameters=()): + self.ast = ast + self.temporary_fields = tuple(temporary_fields) + self.field_swaps = tuple(field_swaps) + self.varying_parameters = tuple(varying_parameters) + self.parameters = ast.get_parameters() # cache parameters here + + @property + def fields_accessed(self): + return self.ast.fields_accessed + + def get_ast_attr(self, name): + """Returns the value of an attribute of the AST managed by this KernelInfo. 
+ For compatibility with KernelFamily.""" + return self.ast.__getattribute__(name) + + def get_headers(self): + all_headers = [list(get_headers(self.ast))] + return reduce(merge_sorted_lists, all_headers) + + def generate_kernel_invocation_code(self, **kwargs): + ast = self.ast + ast_params = self.parameters + is_cpu = self.ast.target == Target.CPU + call_parameters = ", ".join([p.symbol.name for p in ast_params]) + + if not is_cpu: + stream = kwargs.get('stream', '0') + spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ()) + + if not spatial_shape_symbols: + spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] + spatial_shape_symbols.sort(key=lambda e: e.coordinate) + else: + spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] + + assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\ + "Please only use kernels for generic field sizes!" + + indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) + sp_printer_c = CudaSympyPrinter() + kernel_call_lines = [ + "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['block']), + "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['grid']), + "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, + stream, call_parameters), + ] + + return "\n".join(kernel_call_lines) + else: + return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py index b2e831cc8e6a8fb111d9620c154d98ef1c1c7717..c946f85105185159e317ea18d4740667cd7761c7 100644 --- a/python/pystencils_walberla/kernel_selection.py +++ b/python/pystencils_walberla/kernel_selection.py @@ -8,6 +8,8 @@ from pystencils.backends.cbackend import get_headers from pystencils.backends.cuda_backend import CudaSympyPrinter from 
pystencils.typing.typed_sympy import SHAPE_DTYPE +from pystencils_walberla.utility import merge_lists_of_symbols, merge_sorted_lists + """ @@ -120,6 +122,41 @@ class AbstractConditionNode(AbstractKernelSelectionNode, ABC): return code +class SwitchNode(AbstractKernelSelectionNode): + def __init__(self, parameter_symbol, cases_dict): + self.cases_dict = cases_dict + self.parameter_symbol = parameter_symbol + + @property + def selection_parameters(self): + return {self.parameter_symbol} + + def collect_kernel_calls(self): + return reduce(lambda x, y: x | y.collect_kernel_calls(), self.cases_dict.values(), set()) + + def collect_selection_parameters(self): + return reduce(lambda x, y: x | y.collect_selection_parameters(), + self.cases_dict.values(), + self.selection_parameters) + + def get_code(self, **kwargs): + def case_code(case, subtree): + code = f"case {case} : {{\n" + code += do_indent(subtree.get_code(**kwargs), width=4, first=True) + code += "\n break;\n}" + return code + + cases = [case_code(k, v) for k, v in self.cases_dict.items()] + switch_code = f"switch ({self.parameter_symbol.name}) {{\n" + + switch_body = '\n'.join(cases) + switch_body = do_indent(switch_body, width=4, first=True) + + switch_code += switch_body + switch_code += "default: break; \n}" + return switch_code + + class KernelCallNode(AbstractKernelSelectionNode): def __init__(self, ast): self.ast = ast @@ -153,14 +190,16 @@ class KernelCallNode(AbstractKernelSelectionNode): indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = CudaSympyPrinter() + + block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) + grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) + + kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" + kernel_call_lines = [ - "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['block']), - "dim3 
_grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['grid']), - "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, - stream, call_parameters), - ] + f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));", + f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));", + kernel_launch] return "\n".join(kernel_call_lines) else: @@ -190,22 +229,29 @@ class SimpleBooleanCondition(AbstractConditionNode): class KernelFamily: def __init__(self, kernel_selection_tree: AbstractKernelSelectionNode, class_name: str, - temporary_fields=(), field_swaps=(), varying_parameters=()): + temporary_fields=(), field_swaps=(), varying_parameters=(), + field_timestep=None): self.kernel_selection_tree = kernel_selection_tree self.kernel_selection_parameters = kernel_selection_tree.get_selection_parameter_list() self.temporary_fields = tuple(temporary_fields) self.field_swaps = tuple(field_swaps) + self.field_timestep = field_timestep self.varying_parameters = tuple(varying_parameters) all_kernel_calls = self.kernel_selection_tree.collect_kernel_calls() all_param_lists = [k.parameters for k in all_kernel_calls] asts_list = [k.ast for k in all_kernel_calls] self.representative_ast = asts_list[0] + self.target = self.representative_ast.target # Eliminate duplicates self.all_asts = set(asts_list) - # Check function names for uniqueness and reformat them + # TODO due to backward compatibility with high level interface spec + if self.field_timestep is not None: + self.kernel_selection_parameters = [] + + # Check function names for uniqueness and reformat them # using the class name function_names = [ast.function_name.lower() for ast in self.all_asts] unique_names = set(function_names) @@ -256,7 +302,7 @@ class AbstractInterfaceArgumentMapping: raise NotImplementedError() @property - def headers(self): + def headers(self) -> Set: return set() @@ -310,34 +356,4 @@ class 
HighLevelInterfaceSpec: # ---------------------------------- Helpers -------------------------------------------------------------------------- -def merge_sorted_lists(lx, ly, sort_key=lambda x: x, identity_check_key=None): - if identity_check_key is None: - identity_check_key = sort_key - nx = len(lx) - ny = len(ly) - - def recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern): - if ix_intern == nx: - return ly_intern[iy_intern:] - if iy_intern == ny: - return lx_intern[ix_intern:] - x = lx_intern[ix_intern] - y = ly_intern[iy_intern] - skx = sort_key(x) - sky = sort_key(y) - if skx == sky: - if identity_check_key(x) == identity_check_key(y): - return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern + 1) - else: - raise ValueError(f'Elements <{x}> and <{y}> with equal sort key where not identical!') - elif skx < sky: - return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern) - else: - return [y] + recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern + 1) - return recursive_merge(lx, ly, 0, 0) - -def merge_lists_of_symbols(lists): - def merger(lx, ly): - return merge_sorted_lists(lx, ly, sort_key=lambda x: x.symbol.name, identity_check_key=lambda x: x.symbol) - return reduce(merger, lists) diff --git a/python/pystencils_walberla/pack_info.py b/python/pystencils_walberla/pack_info.py new file mode 100644 index 0000000000000000000000000000000000000000..221a946e004143f0f02c3a2663df6726add4027f --- /dev/null +++ b/python/pystencils_walberla/pack_info.py @@ -0,0 +1,288 @@ +from collections import OrderedDict, defaultdict +from dataclasses import replace +from itertools import product +from typing import Dict, Optional, Sequence, Tuple + +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils import Assignment, AssignmentCollection, Field, FieldType, Target, create_kernel +from pystencils.backends.cbackend import get_headers +from pystencils.stencil import inverse_direction, 
offset_to_direction_string + +from pystencils_walberla.cmake_integration import CodeGenerationContext +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.kernel_info import KernelInfo +from pystencils_walberla.utility import config_from_context + + +def generate_pack_info_for_field(ctx: CodeGenerationContext, class_name: str, field: Field, + direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None, + operator=None, gl_to_inner=False, + target=Target.CPU, data_type=None, cpu_openmp=False, + **create_kernel_params): + """Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements. + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated class + field: pystencils field for which to generate pack info + direction_subset: optional sequence of directions for which values should be packed + otherwise a D3Q27 stencil is assumed + operator: optional operator for, e.g., reduction pack infos + gl_to_inner: communicates values from ghost layers of sender to interior of receiver + target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. 
+ **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + + if not direction_subset: + direction_subset = tuple((i, j, k) for i, j, k in product(*[(-1, 0, 1)] * 3)) + + all_index_accesses = [field(*ind) for ind in product(*[range(s) for s in field.index_shape])] + return generate_pack_info(ctx, class_name, {direction_subset: all_index_accesses}, operator=operator, + gl_to_inner=gl_to_inner, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + +def generate_pack_info_from_kernel(ctx: CodeGenerationContext, class_name: str, assignments: Sequence[Assignment], + kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False, + **create_kernel_params): + """Generates a waLBerla GPU PackInfo from a (pull) kernel. + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated class + assignments: list of assignments from the compute kernel - generates PackInfo for "pull" part only + i.e. the kernel is expected to only write to the center + kind: can either be pull or push + operator: optional operator for, e.g., reduction pack infos + target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. 
+ **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + assert kind in ('push', 'pull') + reads = set() + writes = set() + + if isinstance(assignments, AssignmentCollection): + assignments = assignments.all_assignments + + for a in assignments: + if not isinstance(a, Assignment): + continue + reads.update(a.rhs.atoms(Field.Access)) + writes.update(a.lhs.atoms(Field.Access)) + spec = defaultdict(set) + if kind == 'pull': + for fa in reads: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + comm_direction = inverse_direction(fa.offsets) + for comm_dir in _comm_directions(comm_direction): + spec[(comm_dir,)].add(fa.field.center(*fa.index)) + elif kind == 'push': + for fa in writes: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + for comm_dir in _comm_directions(fa.offsets): + spec[(comm_dir,)].add(fa) + else: + raise ValueError("Invalid 'kind' parameter") + return generate_pack_info(ctx, class_name, spec, operator=operator, + target=target, data_type=data_type, cpu_openmp=cpu_openmp, **create_kernel_params) + + +def generate_pack_info(ctx: CodeGenerationContext, class_name: str, + directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]], + namespace='pystencils', operator=None, gl_to_inner=False, + target=Target.CPU, data_type=None, cpu_openmp=False, + **create_kernel_params): + """Generates a waLBerla GPU PackInfo + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated class + directions_to_pack_terms: maps tuples of directions to read field accesses, specifying which values have to be + packed for which direction + namespace: inner namespace of the generated class + operator: optional operator for, e.g., reduction pack infos + gl_to_inner: communicates values from ghost layers of sender to interior of receiver + target: An pystencils Target to define cpu or gpu 
code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. + **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + if cpu_openmp: + raise ValueError("The packing kernels are already called inside an OpenMP parallel region. Thus " + "additionally parallelising each kernel is not supported.") + items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()] + items = sorted(items, key=lambda e: e[0]) + directions_to_pack_terms = OrderedDict(items) + + config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + **create_kernel_params) + + config_zero_gl = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + ghost_layers=0, **create_kernel_params) + + # Vectorisation of the pack info is not implemented. + config = replace(config, cpu_vectorize_info=None) + config_zero_gl = replace(config_zero_gl, cpu_vectorize_info=None) + + config = replace(config, allow_double_writes=True) + config_zero_gl = replace(config_zero_gl, allow_double_writes=True) + + template_name = "CpuPackInfo.tmpl" if config.target == Target.CPU else 'GpuPackInfo.tmpl' + + fields_accessed = set() + for terms in directions_to_pack_terms.values(): + for term in terms: + assert isinstance(term, Field.Access) # and all(e == 0 for e in term.offsets) + fields_accessed.add(term) + + field_names = {fa.field.name for fa in fields_accessed} + + data_types = {fa.field.dtype for fa in fields_accessed} + if len(data_types) == 0: + raise ValueError("No fields to pack!") + if len(data_types) != 1: + err_detail = "\n".join(f" - {f.name} [{f.dtype}]" for f in fields_accessed) + raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail) + dtype = data_types.pop() + + pack_kernels = OrderedDict() + unpack_kernels = OrderedDict() 
+ all_accesses = set() + elements_per_cell = OrderedDict() + for direction_set, terms in directions_to_pack_terms.items(): + for d in direction_set: + if not all(abs(i) <= 1 for i in d): + raise NotImplementedError("Only first neighborhood supported") + + buffer = Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER, + dtype=dtype.numpy_dtype, index_shape=(len(terms),)) + + direction_strings = tuple(offset_to_direction_string(d) for d in direction_set) + all_accesses.update(terms) + + pack_assignments = [Assignment(buffer(i), term) for i, term in enumerate(terms)] + pack_ast = create_kernel(pack_assignments, config=config_zero_gl) + pack_ast.function_name = 'pack_{}'.format("_".join(direction_strings)) + if operator is None: + unpack_assignments = [Assignment(term, buffer(i)) for i, term in enumerate(terms)] + else: + unpack_assignments = [Assignment(term, operator(term, buffer(i))) for i, term in enumerate(terms)] + unpack_ast = create_kernel(unpack_assignments, config=config_zero_gl) + unpack_ast.function_name = 'unpack_{}'.format("_".join(direction_strings)) + + pack_kernels[direction_strings] = KernelInfo(pack_ast) + unpack_kernels[direction_strings] = KernelInfo(unpack_ast) + elements_per_cell[direction_strings] = len(terms) + fused_kernel = create_kernel([Assignment(buffer.center, t) for t in all_accesses], config=config) + + jinja_context = { + 'class_name': class_name, + 'pack_kernels': pack_kernels, + 'unpack_kernels': unpack_kernels, + 'fused_kernel': KernelInfo(fused_kernel), + 'elements_per_cell': elements_per_cell, + 'headers': get_headers(fused_kernel), + 'target': config.target.name.lower(), + 'dtype': dtype, + 'field_name': field_names.pop(), + 'namespace': namespace, + 'gl_to_inner': gl_to_inner, + } + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + header = env.get_template(template_name + ".h").render(**jinja_context) + source = 
env.get_template(template_name + ".cpp").render(**jinja_context) + + source_extension = "cpp" if config.target == Target.CPU else "cu" + ctx.write_file(f"{class_name}.h", header) + ctx.write_file(f"{class_name}.{source_extension}", source) + + +def generate_mpidtype_info_from_kernel(ctx: CodeGenerationContext, class_name: str, + assignments: Sequence[Assignment], kind='pull', namespace='pystencils'): + assert kind in ('push', 'pull') + reads = set() + writes = set() + + if isinstance(assignments, AssignmentCollection): + assignments = assignments.all_assignments + + for a in assignments: + if not isinstance(a, Assignment): + continue + reads.update(a.rhs.atoms(Field.Access)) + writes.update(a.lhs.atoms(Field.Access)) + + spec = defaultdict(set) + if kind == 'pull': + read_fields = set(fa.field for fa in reads) + assert len(read_fields) == 1, "Only scenarios where one fields neighbors are accessed" + field = read_fields.pop() + for fa in reads: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + comm_direction = inverse_direction(fa.offsets) + for comm_dir in _comm_directions(comm_direction): + assert len(fa.index) == 1, "Supports only fields with a single index dimension" + spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0]) + elif kind == 'push': + written_fields = set(fa.field for fa in writes) + assert len(written_fields) == 1, "Only scenarios where one fields neighbors are accessed" + field = written_fields.pop() + + for fa in writes: + assert all(abs(e) <= 1 for e in fa.offsets) + if all(offset == 0 for offset in fa.offsets): + continue + for comm_dir in _comm_directions(fa.offsets): + assert len(fa.index) == 1, "Supports only fields with a single index dimension" + spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0]) + else: + raise ValueError("Invalid 'kind' parameter") + + jinja_context = { + 'class_name': class_name, + 'namespace': namespace, + 'kind': kind, + 'field_name': 
field.name, + 'f_size': field.index_shape[0], + 'spec': spec, + } + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + header = env.get_template("MpiDtypeInfo.tmpl.h").render(**jinja_context) + ctx.write_file(f"{class_name}.h", header) + + +# ---------------------------------- Internal -------------------------------------------------------------------------- + +def _comm_directions(direction): + if all(e == 0 for e in direction): + yield direction + binary_numbers_list = binary_numbers(len(direction)) + for comm_direction in binary_numbers_list: + for i in range(len(direction)): + if direction[i] == 0: + comm_direction[i] = 0 + if direction[i] == -1 and comm_direction[i] == 1: + comm_direction[i] = -1 + if not all(e == 0 for e in comm_direction): + yield tuple(comm_direction) + + +def binary_numbers(n): + result = list() + for i in range(1 << n): + binary_number = bin(i)[2:] + binary_number = '0' * (n - len(binary_number)) + binary_number + result.append((list(map(int, binary_number)))) + return result diff --git a/python/pystencils_walberla/sweep.py b/python/pystencils_walberla/sweep.py new file mode 100644 index 0000000000000000000000000000000000000000..ddf9a2a52b0de504394becdf99127a06f866383d --- /dev/null +++ b/python/pystencils_walberla/sweep.py @@ -0,0 +1,199 @@ +from typing import Callable, Sequence + +from jinja2 import Environment, PackageLoader, StrictUndefined + +from pystencils import Target, Assignment +from pystencils import Field, create_kernel, create_staggered_kernel +from pystencils.astnodes import KernelFunction + +from pystencils_walberla.cmake_integration import CodeGenerationContext +from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec +from pystencils_walberla.utility import config_from_context + + +def generate_sweep(ctx: CodeGenerationContext, class_name: str, 
assignments: Sequence[Assignment], + namespace: str = 'pystencils', field_swaps=(), staggered=False, varying_parameters=(), + inner_outer_split=False, ghost_layers_to_include=0, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None, + **create_kernel_params): + """Generates a waLBerla sweep from a pystencils representation. + + The constructor of the C++ sweep class expects all kernel parameters (fields and parameters) in alphabetical order. + Fields have to passed using BlockDataID's pointing to walberla fields + + Args: + ctx: build system context filled with information from waLBerla's CMake. The context for example + defines where to write generated files, if OpenMP is available or which SIMD instruction + set should be used. See waLBerla examples on how to get a context. + class_name: name of the generated sweep class + assignments: list of assignments defining the stencil update rule or a :class:`KernelFunction` + namespace: the generated class is accessible as walberla::<namespace>::<class_name> + field_swaps: sequence of field pairs (field, temporary_field). The generated sweep only gets the first field + as argument, creating a temporary field internally which is swapped with the first field after + each iteration. + staggered: set to True to create staggered kernels with `pystencils.create_staggered_kernel` + varying_parameters: Depending on the configuration, the generated kernels may receive different arguments for + different setups. To not have to adapt the C++ application when then parameter change, + the varying_parameters sequence can contain parameter names, which are always expected by + the C++ class constructor even if the kernel does not need them. + inner_outer_split: if True generate a sweep that supports separate iteration over inner and outer regions + to allow for communication hiding. + ghost_layers_to_include: determines how many ghost layers should be included for the Sweep. 
+ This is relevant if a setter kernel should also set correct values to the ghost layers. + target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target + data_type: default datatype for the kernel creation. Default is double + cpu_openmp: if loops should use openMP or not. + cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set. + max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__` + **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` + """ + if staggered: + assert 'omp_single_loop' not in create_kernel_params + create_kernel_params['omp_single_loop'] = False + config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) + + if isinstance(assignments, KernelFunction): + ast = assignments + target = ast.target + elif not staggered: + ast = create_kernel(assignments, config=config) + else: + # This should not be necessary but create_staggered_kernel does not take a config at the moment ... 
+ ast = create_staggered_kernel(assignments, **config.__dict__) + + ast.function_name = class_name.lower() + + selection_tree = KernelCallNode(ast) + generate_selective_sweep(ctx, class_name, selection_tree, target=target, namespace=namespace, + field_swaps=field_swaps, varying_parameters=varying_parameters, + inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, + cpu_vectorize_info=config.cpu_vectorize_info, + cpu_openmp=config.cpu_openmp, max_threads=max_threads) + + +def generate_selective_sweep(ctx, class_name, selection_tree, interface_mappings=(), target=None, + namespace='pystencils', field_swaps=(), varying_parameters=(), + inner_outer_split=False, ghost_layers_to_include=0, + cpu_vectorize_info=None, cpu_openmp=False, max_threads=None): + """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple + pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`. + + Args: + ctx: see documentation of `generate_sweep` + class_name: name of the generated sweep class + selection_tree: Instance of `AbstractKernelSelectionNode`, root of the selection tree + interface_mappings: sequence of `AbstractInterfaceArgumentMapping` instances for selection arguments of + the selection tree + target: `None`, `Target.CPU` or `Target.GPU`; inferred from kernels if `None` is given. + namespace: see documentation of `generate_sweep` + field_swaps: see documentation of `generate_sweep` + varying_parameters: see documentation of `generate_sweep` + inner_outer_split: see documentation of `generate_sweep` + ghost_layers_to_include: see documentation of `generate_sweep` + cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels + cpu_openmp: Whether or not CPU kernels use OpenMP parallelization + max_threads: only relevant for GPU kernels. 
Will be argument of `__launch_bounds__` + """ + def to_name(f): + return f.name if isinstance(f, Field) else f + + field_swaps = tuple((to_name(e[0]), to_name(e[1])) for e in field_swaps) + temporary_fields = tuple(e[1] for e in field_swaps) + + kernel_family = KernelFamily(selection_tree, class_name, + temporary_fields, field_swaps, varying_parameters) + + if target is None: + target = kernel_family.get_ast_attr('target') + elif target != kernel_family.get_ast_attr('target'): + raise ValueError('Mismatch between target parameter and AST targets.') + + if not ctx.gpu and target == Target.GPU: + return + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] + + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) + + jinja_context = { + 'kernel': kernel_family, + 'namespace': namespace, + 'class_name': class_name, + 'target': target.name.lower(), + 'field': representative_field, + 'ghost_layers_to_include': ghost_layers_to_include, + 'inner_outer_split': inner_outer_split, + 'interface_spec': interface_spec, + 'generate_functor': True, + 'cpu_vectorize_info': cpu_vectorize_info, + 'cpu_openmp': cpu_openmp, + 'max_threads': max_threads + } + header = env.get_template("Sweep.tmpl.h").render(**jinja_context) + source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + ctx.write_file(f"{class_name}.h", header) + ctx.write_file(f"{class_name}.{source_extension}", source) + + +def generate_sweep_collection(ctx, class_name: str, function_generators: Sequence[Callable], parameter_scaling=None): + """Generates a sweep collection + """ + + contexts_function_generators = list() + for fct in function_generators: + 
contexts_function_generators.append(fct()) + + namespaces = set([context['namespace'] for context in contexts_function_generators]) + assert len(namespaces) == 1, "All function_generators must output the same namespace!" + namespace = namespaces.pop() + + headers = set() + for context in contexts_function_generators: + for header in context['interface_spec'].headers: + headers.add(header) + for header in context['kernel'].get_headers(): + headers.add(header) + + kernel_list = list() + for context in contexts_function_generators: + kernel_list.append(context['kernel']) + + kernels = list() + for context in contexts_function_generators: + kernels.append({ + 'kernel': context['kernel'], + 'function_name': context['function_name'], + 'ghost_layers_to_include': 'ghost_layers', + 'field': context['field'], + 'max_threads': context['max_threads'] + }) + + target = kernels[0]['kernel'].target + + jinja_context = { + 'kernel_list': kernel_list, + 'kernels': kernels, + 'namespace': namespace, + 'class_name': class_name, + 'headers': headers, + 'target': target.name.lower(), + 'parameter_scaling': parameter_scaling, + } + + env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) + add_pystencils_filters_to_jinja_env(env) + + header = env.get_template("SweepCollection.tmpl.h").render(**jinja_context) + source = env.get_template("SweepCollection.tmpl.cpp").render(**jinja_context) + + source_extension = "cpp" if target == Target.CPU else "cu" + ctx.write_file(f"{class_name}.h", header) + ctx.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.cpp b/python/pystencils_walberla/templates/Boundary.tmpl.cpp index a7b1c064ef752aff2a5b1edc435aedddc7d6d966..644202ba67cd574724e46ef2b42e60535dc2e5c6 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.cpp +++ b/python/pystencils_walberla/templates/Boundary.tmpl.cpp @@ -17,13 +17,11 @@ //! 
\\author pystencils //====================================================================================================================== -#include <cmath> - #include "core/DataTypes.h" #include "core/Macros.h" #include "{{class_name}}.h" {% if target == 'gpu' -%} -#include "cuda/ErrorChecking.h" +#include "gpu/ErrorChecking.h" {%- endif %} @@ -53,9 +51,9 @@ namespace {{namespace}} { #pragma diag_suppress 177 #endif #endif - +//NOLINTBEGIN(readability-non-const-parameter*) {{kernel|generate_definitions(target)}} - +//NOLINTEND(readability-non-const-parameter*) #ifdef __GNUC__ #pragma GCC diagnostic pop #endif @@ -67,7 +65,7 @@ namespace {{namespace}} { void {{class_name}}::run_impl( {{- ["IBlock * block", "IndexVectors::Type type", - kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { @@ -85,26 +83,27 @@ void {{class_name}}::run_impl( uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); {{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}} + {{kernel|generate_timestep_advancements|indent(4)}} {{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }} {{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}} } void {{class_name}}::run( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { run_impl( {{- ["block", "IndexVectors::ALL", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } void {{class_name}}::inner( - {{- ["IBlock * block", kernel.kernel_selection_parameters, 
["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { run_impl( {{- ["block", "IndexVectors::INNER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } void {{class_name}}::outer( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { run_impl( {{- ["block", "IndexVectors::OUTER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h index 03b403aeac1797acd4e429be140a8e35c63d4cb6..96a9202c19345f0e36c5e048be1ee65969f5c966 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.h +++ b/python/pystencils_walberla/templates/Boundary.tmpl.h @@ -23,8 +23,9 @@ {% if target is equalto 'cpu' -%} #include "field/GhostLayerField.h" {%- elif target is equalto 'gpu' -%} -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" {%- endif %} #include "domain_decomposition/BlockDataID.h" #include "domain_decomposition/IBlock.h" @@ -75,7 +76,7 @@ public: {% if target == 'gpu' -%} ~IndexVectors() { for( auto & gpuVec: gpuVectors_) - cudaFree( gpuVec ); + WALBERLA_GPU_CHECK(gpuFree( gpuVec )); } {% endif -%} @@ -90,7 +91,7 @@ public: { {% if target == 'gpu' -%} for( auto & gpuVec: gpuVectors_) - cudaFree( gpuVec ); + WALBERLA_GPU_CHECK(gpuFree( gpuVec )); gpuVectors_.resize( cpuVectors_.size() ); WALBERLA_ASSERT_EQUAL(cpuVectors_.size(), NUM_TYPES); @@ -98,8 +99,8 @@ public: { auto & gpuVec = 
gpuVectors_[i]; auto & cpuVec = cpuVectors_[i]; - cudaMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() ); - cudaMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), cudaMemcpyHostToDevice ); + WALBERLA_GPU_CHECK(gpuMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() )); + WALBERLA_GPU_CHECK(gpuMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), gpuMemcpyHostToDevice )); } {%- endif %} } @@ -122,12 +123,12 @@ public: }; void run ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); {% if generate_functor -%} void operator() ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); @@ -135,28 +136,28 @@ public: {%- endif %} void inner ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); void outer ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, 
["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] (IBlock * b) { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) @@ -238,10 +239,12 @@ public: {%endif%} {%else%} auto flagWithGLayers = flagField->xyzSizeWithGhostLayer(); + {% if single_link %} {{dtype}} dot = 0.0; {{dtype}} maxn = 0.0; cell_idx_t calculated_idx = 0; cell_idx_t dx = 0; cell_idx_t dy = 0; {%if dim == 3%} cell_idx_t dz = 0; {% endif %} cell_idx_t sum_x = 0; cell_idx_t sum_y = 0; {%if dim == 3%} cell_idx_t sum_z = 0; 
{%endif %} + {% endif -%} for( auto it = flagField->beginWithGhostLayerXYZ(); it != flagField->end(); ++it ) { {% if single_link -%} @@ -297,7 +300,7 @@ public: private: void run_impl( {{- ["IBlock * block", "IndexVectors::Type type", - kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp index d56ec573032eaddba9ba9b959883a864a3f3ce63..0191994f3f3a29ef9384b2a2270294be9df59f43 100644 --- a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp +++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp @@ -1,3 +1,22 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author pystencils +//====================================================================================================================== + #include "stencil/Directions.h" #include "core/cell/CellInterval.h" #include "core/DataTypes.h" diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h index d25c04b2b782fe891de361356aa046554d32f1ae..66114de6ee87d58f37d08ef2e39251a2f1060717 100644 --- a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h +++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h @@ -1,3 +1,22 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author pystencils +//====================================================================================================================== + #pragma once #include "stencil/Directions.h" #include "core/cell/CellInterval.h" diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp index db79ae375bf8c815da1623abe6961aa727bc1ede..19b7b11ed507f8f068a3deb5908a1ca6fe867711 100644 --- a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp +++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp @@ -1,9 +1,23 @@ -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" -#include "core/DataTypes.h" -#include "{{class_name}}.h" +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! 
\\author pystencils +//====================================================================================================================== +#include "{{class_name}}.h" {% if target is equalto 'cpu' -%} #define FUNC_PREFIX @@ -29,7 +43,7 @@ using walberla::stencil::Direction; -void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) +void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, gpuStream_t stream) { {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer); @@ -59,7 +73,7 @@ void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * b } -void {{class_name}}::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream) +void {{class_name}}::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, gpuStream_t stream) { {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer); diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h index 8b70e1cb8dce1898f8a5c955c59f810bc3353aa8..b301bced5b8bd159c028e6e75c26fd37df5a63b2 100644 --- a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h +++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h @@ -1,11 +1,34 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! \\author pystencils +//====================================================================================================================== + #pragma once -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" + #include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + #include "domain_decomposition/IBlock.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" +#include "stencil/Directions.h" + +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" {% if target is equalto 'cpu' -%} #define FUNC_PREFIX @@ -25,7 +48,7 @@ namespace walberla { namespace {{namespace}} { -class {{class_name}} : public ::walberla::cuda::GeneratedGPUPackInfo +class {{class_name}} : public ::walberla::gpu::GeneratedGPUPackInfo { public: {{class_name}}( {{fused_kernel|generate_constructor_parameters(parameters_to_ignore=['buffer'])}} ) @@ -33,9 +56,13 @@ public: {}; virtual ~{{class_name}}() {} - virtual void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); - virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream); - virtual uint_t size (stencil::Direction dir, IBlock * block); + void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + void communicateLocal ( stencil::Direction /*dir*/, const IBlock* /* sender */, IBlock* /* receiver */, gpuStream_t /* stream */ ) override + { + WALBERLA_ABORT("Local Communication not implemented yet for standard PackInfos. 
To run your application turn of local communication in the Communication class") + } + void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + uint_t size (stencil::Direction dir, IBlock * block) override; private: {{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}} diff --git a/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h b/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h index 3f9cbb2e659f58eb0b6ae1ff7dcb0e5b1cf0a8e5..860ea49717b76efbe205698a1eb14ed3c0d71797 100644 --- a/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h +++ b/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h @@ -1,3 +1,22 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author pystencils +//====================================================================================================================== + #pragma once #include "core/debug/Debug.h" diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp index 96e589e1e549bd3ad28075e663ff399131dc4a33..8f3e14e59074a2f483fe14c5f85eb3e352c0a836 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp +++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp @@ -14,8 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \\file {{class_name}}.cpp -//! \\ingroup lbm -//! \\author lbmpy +//! \\author pystencils //====================================================================================================================== #include <cmath> @@ -56,7 +55,7 @@ namespace {{namespace}} { {{kernel|generate_definitions(target, max_threads)}} -void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) +void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }} @@ -67,7 +66,7 @@ void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_paramet void {{class_name}}::runOnCellInterval( {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block", - kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { @@ -86,7 +85,7 @@ void {{class_name}}::runOnCellInterval( } {%if 
inner_outer_split%} -void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) +void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} @@ -98,7 +97,7 @@ void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_param } -void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) +void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h index 093e2332ac4e5925305701a4f0d54887f18dffe1..e0b773ab1b1ab656a8db81ae10459d01b84766a9 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.h +++ b/python/pystencils_walberla/templates/Sweep.tmpl.h @@ -19,13 +19,15 @@ #pragma once #include "core/DataTypes.h" +#include "core/logging/Logging.h" {% if target is equalto 'cpu' -%} #include "field/GhostLayerField.h" {%- elif target is equalto 'gpu' -%} -#include "cuda/GPUField.h" +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" {% if inner_outer_split -%} -#include "cuda/ParallelStreams.h" +#include "gpu/ParallelStreams.h" {%- endif %} {%- endif %} #include "field/SwapableCompare.h" @@ -65,17 +67,17 @@ public: {{ kernel| generate_destructor(class_name) |indent(4) }} - void run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + void run( {{- ["IBlock * block", 
kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); void runOnCellInterval( {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block", - kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); {% if generate_functor %} void operator() ( - {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); @@ -91,14 +93,14 @@ public: { kernel->run( {{- [ ['b'], kernel.kernel_selection_parameters] | identifier_list -}} ); }; } - static std::function<void (IBlock* {%- if target is equalto 'gpu' %}, cudaStream_t {% endif -%} )> getSweepOnCellInterval( + static std::function<void (IBlock* {%- if target is equalto 'gpu' %}, gpuStream_t {% endif -%} )> getSweepOnCellInterval( {{- ["const shared_ptr<" + class_name + "> & kernel", "const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", kernel.kernel_selection_parameters, 'cell_idx_t ghostLayers=1'] | type_identifier_list -}} ) { return [ {{- ["kernel", "blocks", "globalCellInterval", "ghostLayers", kernel.kernel_selection_parameters] | identifier_list -}} ] - (IBlock * b{%- if target is equalto 'gpu'%}, cudaStream_t stream = nullptr{% endif -%}) + (IBlock * b{%- if target is equalto 'gpu'%}, gpuStream_t stream = nullptr{% endif -%}) { kernel->runOnCellInterval( {{- ["blocks", "globalCellInterval", "ghostLayers", 
"b", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list @@ -106,7 +108,7 @@ public: ); }; } - std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] (IBlock * b) @@ -115,7 +117,7 @@ public: std::function<void (IBlock *)> getSweepOnCellInterval( {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", - interface_spec.high_level_args, 'cell_idx_t ghostLayers=1', ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + interface_spec.high_level_args, 'cell_idx_t ghostLayers=1', ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { @@ -125,18 +127,18 @@ public: } {% if inner_outer_split %} - void inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + void inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - void outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + void outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> 
getInnerSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) { return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] (IBlock * b) @@ -152,7 +154,7 @@ public: {{kernel|generate_members|indent(4)}} private: - {%if target is equalto 'gpu' -%} cuda::ParallelStreams parallelStreams_; {%- endif %} + {%if target is equalto 'gpu' -%} gpu::ParallelStreams parallelStreams_; {%- endif %} Cell outerWidth_; std::vector<CellInterval> layers_; diff --git a/python/pystencils_walberla/templates/SweepCollection.tmpl.cpp b/python/pystencils_walberla/templates/SweepCollection.tmpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a9a1c28434bff3d257ca2bf9c76bd4fa20d9f1db --- /dev/null +++ b/python/pystencils_walberla/templates/SweepCollection.tmpl.cpp @@ -0,0 +1,69 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.cpp +//! \\author pystencils +//====================================================================================================================== +#include "{{class_name}}.h" + +{% if target is equalto 'cpu' -%} +#define FUNC_PREFIX +{%- elif target is equalto 'gpu' -%} +#define FUNC_PREFIX __global__ +{%- endif %} + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning push +#pragma warning( disable : 1599 ) +#endif + +using namespace std; + +namespace walberla { +namespace {{namespace}} { + +{% for kernel in kernels %} +{{kernel['kernel']|generate_definitions(target, kernel['max_threads'])}} +{% endfor %} + + +{% for kernel in kernels %} +void {{class_name}}::{{kernel['function_name']}}( {{kernel['kernel']|generate_plain_parameter_list(ghost_layers=True)}} ) +{ + {{kernel['kernel']|generate_call(ghost_layers_to_include=kernel['ghost_layers_to_include'], stream='stream')|indent(3)}} +} +void {{class_name}}::{{kernel['function_name']}}CellInterval( {{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci')}}) +{ + {{kernel['kernel']|generate_call(stream='stream', cell_interval='ci')|indent(3)}} +} +{% endfor %} + + +} // namespace {{namespace}} +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif + +#if ( defined 
WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning pop +#endif diff --git a/python/pystencils_walberla/templates/SweepCollection.tmpl.h b/python/pystencils_walberla/templates/SweepCollection.tmpl.h new file mode 100644 index 0000000000000000000000000000000000000000..5db4ccb33457efcc2f9f9385d0f2b32db35aef5e --- /dev/null +++ b/python/pystencils_walberla/templates/SweepCollection.tmpl.h @@ -0,0 +1,298 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file {{class_name}}.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" +#include "core/Macros.h" + +{% if target is equalto 'gpu' -%} +#include "gpu/GPUField.h" +#include "gpu/ParallelStreams.h" +{%- endif %} + +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/SwapableCompare.h" +#include "field/GhostLayerField.h" + +#include <set> +#include <cmath> + +{% for header in headers %} +#include {{header}} +{% endfor %} + +using namespace std::placeholders; + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wreorder" +#endif + +namespace walberla { +namespace {{namespace}} { + + +class {{class_name}} +{ +public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + {{class_name}}(const shared_ptr< StructuredBlockStorage > & blocks, {{kernel_list|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1)) + : blocks_(blocks), {{ kernel_list|generate_constructor_initializer_list(parameter_registration=parameter_scaling) }}, outerWidth_(outerWidth) + { + {{kernel_list|generate_constructor(parameter_registration=parameter_scaling) |indent(6)}} + + for (auto& iBlock : *blocks) + { + if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || + int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || + int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2) + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + }; + + {{ kernel_list| 
generate_destructor(class_name) |indent(4) }} + + /************************************************************************************* + * Internal Function Definitions with raw Pointer + *************************************************************************************/ + + {%- for kernel in kernels %} + static void {{kernel['function_name']}} ({{kernel['kernel']|generate_plain_parameter_list(ghost_layers=0, stream="nullptr")}}); + static void {{kernel['function_name']}}CellInterval ({{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci', stream="nullptr")}}); + {% endfor %} + + /************************************************************************************* + * Function Definitions for external Usage + *************************************************************************************/ + + {%- for kernel in kernels %} + + std::function<void (IBlock *)> {{kernel['function_name']}}() + { + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); }; + } + + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", ] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); }; + default: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); }; + } + } + + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers"] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { 
{{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); }; + default: + return [{{- ["this", "ghost_layers"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers"] | type_identifier_list -}}); }; + } + } + + {% if target is equalto 'gpu' -%} + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + default: + return [{{- ["this", "ghost_layers", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers", "gpuStream"] | type_identifier_list -}}); }; + } + } + + std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "gpuStream_t gpuStream"] | type_identifier_list -}}) + { + switch (type) + { + case Type::INNER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + case Type::OUTER: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); }; + default: + return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "cell_idx_c(0)", "gpuStream"] | 
type_identifier_list -}}); }; + } + } + {%- endif %} + + void {{kernel['function_name']}}({{- ["IBlock * block",] | type_identifier_list -}}) + { + const cell_idx_t ghost_layers = 0; + {% if target is equalto 'gpu' -%} + gpuStream_t gpuStream = nullptr; + {%- endif %} + + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + + void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers"] | type_identifier_list -}}) + { + {% if target is equalto 'gpu' -%} + gpuStream_t gpuStream = nullptr; + {%- endif %} + + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + + {% if target is equalto 'gpu' -%} + void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}}) + { + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}}); + 
{{kernel['kernel']|generate_swaps|indent(6)}} + } + {%- endif %} + + void {{kernel['function_name']}}CellInterval({{- ["IBlock * block", "const CellInterval & ci", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}}); + {{kernel['kernel']|generate_swaps|indent(6)}} + } + + void {{kernel['function_name']}}Inner({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements(advance=False)|indent(6)}} + + CellInterval inner = {{kernel['field']}}->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='inner')}}); + } + + void {{kernel['function_name']}}Outer({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}}) + { + + {{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}} + {{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}} + {{kernel['kernel']|generate_timestep_advancements|indent(6)}} + + if( layers_.empty() ) + { + CellInterval ci; + + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::T, ci, 
outerWidth_[2], false); + layers_.push_back(ci); + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + {{kernel['field']}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + {%if target is equalto 'gpu'%} + { + auto parallelSection_ = parallelStreams_.parallelSection( gpuStream ); + for( auto & ci: layers_ ) + { + parallelSection_.run([&]( auto s ) { + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}}); + }); + } + } + {% else %} + for( auto & ci: layers_ ) + { + {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}}); + } + {% endif %} + + {{kernel['kernel']|generate_swaps|indent(9)}} + } + {% endfor %} + + {%if target is equalto 'gpu'%} + void setOuterPriority(int priority) + { + parallelStreams_.setStreamPriority(priority); + } + {%endif%} + + private: + shared_ptr< StructuredBlockStorage > blocks_; + {{kernel_list|generate_members(parameter_registration=parameter_scaling)|indent(4)}} + + Cell outerWidth_; + std::vector<CellInterval> layers_; + + {%if target is equalto 'gpu' -%} + gpu::ParallelStreams parallelStreams_; + // std::map<BlockID, gpuStream_t > streams_; + {%- endif %} +}; + + +} // namespace {{namespace}} +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined 
WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py index c109265ef3e5b0f16ff8f9c276394422d096097f..f19a0997497e9659a8c37cb81ba0db85472e7b22 100644 --- a/python/pystencils_walberla/utility.py +++ b/python/pystencils_walberla/utility.py @@ -1,9 +1,17 @@ from os import path -from pystencils.typing import get_base_type -from pystencils_walberla.cmake_integration import CodeGenerationContext +from functools import reduce +from typing import Union, Dict, DefaultDict +import warnings + +from pystencils import CreateKernelConfig, Target +from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets +from pystencils.boundaries.createindexlist import boundary_index_array_coordinate_names, direction_member_name +from pystencils.typing import BasicType, create_type, get_base_type from lbmpy import LBStencil +from pystencils_walberla.cmake_integration import CodeGenerationContext + HEADER_EXTENSIONS = {'.h', '.hpp'} @@ -59,6 +67,145 @@ def generate_info_header(ctx: CodeGenerationContext, ctx.write_file(filename, lines + additional_code) +def get_vectorize_instruction_set(ctx: CodeGenerationContext): + """returns a list of supported vector instruction sets. If waLBerla is not build with + `WALBERLA_OPTIMIZE_FOR_LOCALHOST` `None` is returned. + + Args: + ctx: Code Generation Context + """ + + if ctx.optimize_for_localhost: + supported_instruction_sets = get_supported_instruction_sets() + if supported_instruction_sets: + return supported_instruction_sets[-1] + else: # if cpuinfo package is not installed + warnings.warn("Could not obtain supported vectorization instruction sets - defaulting to sse. " + "This problem can probably be fixed by installing py-cpuinfo. 
This package can " + "gather the needed hardware information.") + return 'sse' + else: + return None + + +def config_from_context(ctx: CodeGenerationContext, target: Target = Target.CPU, + data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = None, + cpu_openmp: Union[bool, int] = None, cpu_vectorize_info: Dict = None, + **kwargs) -> CreateKernelConfig: + """Creates a :class: `pystencils.config.CreateKernelConfig` from the code generation context. By default, + all arguments are determined by the generation context. This means for example if `DWALBERLA_BUILD_WITH_GPU_SUPPORT` is + `True` the kernel will be generated for GPU using either CUDA or HIP. + + Args: + ctx: Code Generation Context + target: All targets are defined in :class:`pystencils.enums.Target` + data_type: Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to + type. If specified as a dict ideally a defaultdict is used to define a default value for symbols + not listed in the dict. If a plain dict is provided it will be transformed into a defaultdict + internally. The default value will then be specified via type collation then. + cpu_openmp: `True` or number of threads for OpenMP parallelization, `False` for no OpenMP. + If set to `True`, the maximum number of available threads will be chosen. + cpu_vectorize_info: A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal' + for documentation of these parameters see vectorize function. Example: + '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}' + kwargs: keyword arguments that can be taken by :class: `pystencils.config.CreateKernelConfig` + """ + + if target == Target.GPU and not ctx.gpu: + raise ValueError("can not generate gpu code if waLBerla is not build with GPU support. 
Please use " + "-DWALBERLA_BUILD_WITH_CUDA=1 or -DWALBERLA_BUILD_WITH_HIP=1 for configuring cmake") + + default_dtype = "float64" if ctx.double_accuracy else "float32" + if data_type is None: + data_type = default_dtype + + if cpu_openmp and not ctx.openmp: + warnings.warn("Code is generated with OpenMP pragmas but waLBerla is not build with OpenMP. " + "The compilation might not work due to wrong compiler flags. " + "Please use -DWALBERLA_BUILD_WITH_OPENMP=1 for configuring cmake") + + if cpu_openmp is None: + cpu_openmp = ctx.openmp + + if cpu_vectorize_info is None: + cpu_vectorize_info = {} + + default_vec_is = get_vectorize_instruction_set(ctx) + + cpu_vectorize_info['instruction_set'] = cpu_vectorize_info.get('instruction_set', default_vec_is) + cpu_vectorize_info['assume_inner_stride_one'] = cpu_vectorize_info.get('assume_inner_stride_one', True) + cpu_vectorize_info['assume_aligned'] = cpu_vectorize_info.get('assume_aligned', False) + cpu_vectorize_info['nontemporal'] = cpu_vectorize_info.get('nontemporal', False) + cpu_vectorize_info['assume_sufficient_line_padding'] = cpu_vectorize_info.get('assume_sufficient_line_padding', + False) + + config = CreateKernelConfig(target=target, data_type=data_type, default_number_float=data_type, + cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, + **kwargs) + + return config + + +def merge_sorted_lists(lx, ly, sort_key=lambda x: x, identity_check_key=None): + if identity_check_key is None: + identity_check_key = sort_key + nx = len(lx) + ny = len(ly) + + def recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern): + if ix_intern == nx: + return ly_intern[iy_intern:] + if iy_intern == ny: + return lx_intern[ix_intern:] + x = lx_intern[ix_intern] + y = ly_intern[iy_intern] + skx = sort_key(x) + sky = sort_key(y) + if skx == sky: + if identity_check_key(x) == identity_check_key(y): + return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern + 1) + else: + raise ValueError(f'Elements <{x}> 
and <{y}> with equal sort key where not identical!') + elif skx < sky: + return [x] + recursive_merge(lx_intern, ly_intern, ix_intern + 1, iy_intern) + else: + return [y] + recursive_merge(lx_intern, ly_intern, ix_intern, iy_intern + 1) + return recursive_merge(lx, ly, 0, 0) + + +def merge_lists_of_symbols(lists): + def merger(lx, ly): + return merge_sorted_lists(lx, ly, sort_key=lambda x: x.symbol.name, identity_check_key=lambda x: x.symbol) + return reduce(merger, lists) + + +def struct_from_numpy_dtype(struct_name, numpy_dtype): + result = f"struct {struct_name} {{ \n" + + equality_compare = [] + constructor_params = [] + constructor_initializer_list = [] + for name, (sub_type, offset) in numpy_dtype.fields.items(): + pystencils_type = create_type(sub_type) + result += f" {pystencils_type} {name};\n" + if name in boundary_index_array_coordinate_names or name == direction_member_name: + constructor_params.append(f"{pystencils_type} {name}_") + constructor_initializer_list.append(f"{name}({name}_)") + else: + constructor_initializer_list.append(f"{name}()") + if pystencils_type.is_float(): + equality_compare.append(f"floatIsEqual({name}, o.{name})") + else: + equality_compare.append(f"{name} == o.{name}") + + result += " %s(%s) : %s {}\n" % \ + (struct_name, ", ".join(constructor_params), ", ".join(constructor_initializer_list)) + result += " bool operator==(const %s & o) const {\n return %s;\n }\n" % \ + (struct_name, " && ".join(equality_compare)) + result += "};\n" + return result + + # ------------------------------------- INTERNAL ------------------------------------------------------------- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e5ca7b0c84c43f0d46a1dd9722bd6cdfe1d1f65e..d49a1e63bbcc36307d83ce80e1440048b2ed8207 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,12 +24,10 @@ add_subdirectory( blockforest ) add_subdirectory( boundary ) add_subdirectory( communication ) add_subdirectory( core ) -if ( CMAKE_CUDA_COMPILER ) - 
add_subdirectory( cuda ) -endif() +add_subdirectory(gpu) add_subdirectory( domain_decomposition ) add_subdirectory( executiontree ) -if ( FFTW3_FOUND ) +if ( WALBERLA_BUILD_WITH_FFT AND FFTW3_FOUND ) add_subdirectory( fft ) endif() add_subdirectory( field ) @@ -37,6 +35,7 @@ add_subdirectory( gather ) add_subdirectory( geometry ) add_subdirectory( gui ) add_subdirectory( lbm ) +add_subdirectory( lbm_generated ) add_subdirectory( lbm_mesapd_coupling ) add_subdirectory( mesa_pd ) if( OPENMESH_FOUND ) diff --git a/src/blockforest/Block.h b/src/blockforest/Block.h index 64c7dafa70efecb428807de242ddf165e4417023..a61de6ac5c898c0ea1f8bb6a28f0a7b7f33fe002 100644 --- a/src/blockforest/Block.h +++ b/src/blockforest/Block.h @@ -270,21 +270,21 @@ inline bool Block::neighborhoodSectionHasSmallerBlocks( const uint_t sectionInde { WALBERLA_ASSERT_LESS( sectionIndex, 26 ); - return !neighborhoodSection_[sectionIndex].empty() && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() > id_.getUsedBits(); + return neighborhoodSectionHasBlocks(sectionIndex) && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() > id_.getUsedBits(); } inline bool Block::neighborhoodSectionHasEquallySizedBlock( const uint_t sectionIndex ) const { WALBERLA_ASSERT_LESS( sectionIndex, 26 ); - return !neighborhoodSection_[sectionIndex].empty() && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() == id_.getUsedBits(); + return neighborhoodSectionHasBlocks(sectionIndex) && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() == id_.getUsedBits(); } inline bool Block::neighborhoodSectionHasLargerBlock( const uint_t sectionIndex ) const { WALBERLA_ASSERT_LESS( sectionIndex, 26 ); - return !neighborhoodSection_[sectionIndex].empty() && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() < id_.getUsedBits(); + return neighborhoodSectionHasBlocks(sectionIndex) && neighborhoodSection_[sectionIndex][0]->id_.getUsedBits() < id_.getUsedBits(); } diff --git a/src/blockforest/BlockDataHandling.h 
b/src/blockforest/BlockDataHandling.h index 7f56467c06b9eebf033753847020843a14a264c3..71e0138be0dd8c3b5af16769c32ffc3ac6e9f386 100644 --- a/src/blockforest/BlockDataHandling.h +++ b/src/blockforest/BlockDataHandling.h @@ -122,65 +122,65 @@ public: BlockData * initialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->initialize( block ); return ptr ? new BlockData( ptr ) : nullptr; } void serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serialize( block, id, buffer ); } void serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serializeCoarseToFine( block, id, buffer, child ); } void serializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serializeFineToCoarse( block, id, buffer ); } BlockData * deserialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserialize( block ); return ptr ? new BlockData( ptr ) : nullptr; } BlockData * deserializeCoarseToFine( Block * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserializeCoarseToFine( block ); return ptr ? new BlockData( ptr ) : nullptr; } BlockData * deserializeFineToCoarse( Block * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserializeFineToCoarse( block ); return ptr ? 
new BlockData( ptr ) : nullptr; } void deserialize( IBlock * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserialize( block, id, buffer ); } void deserializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserializeCoarseToFine( block, id, buffer ); } void deserializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::RecvBuffer & buffer, const uint_t child ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserializeFineToCoarse( block, id, buffer, child ); } diff --git a/src/blockforest/SetupBlockForest.cpp b/src/blockforest/SetupBlockForest.cpp index 9dcdf161345f6aeddfb119f653ca023fa5e282e0..b8d454d45c9d1ddc921343e98bca41db609642eb 100644 --- a/src/blockforest/SetupBlockForest.cpp +++ b/src/blockforest/SetupBlockForest.cpp @@ -1205,7 +1205,7 @@ void SetupBlockForest::balanceLoad( const TargetProcessAssignmentFunction & func const memory_t perProcessMemoryLimit, const bool reorderProcessesByBFS, const bool insertBufferProcesses ) { - WALBERLA_LOG_PROGRESS( "Balancing SetupBlockForest: Creating a process distribution for " << numberOfProcesses_ << " process(es) ..." ) + WALBERLA_LOG_PROGRESS( "Balancing SetupBlockForest: Creating a process distribution for " << numberOfProcesses << " process(es) ..." ) if( minBufferProcessesFraction < real_t(0) || minBufferProcessesFraction >= real_t(1)) WALBERLA_ABORT( "Load balancing failed: \'buffer processes fraction\' must be in [0,1). 
" @@ -1235,7 +1235,7 @@ void SetupBlockForest::balanceLoad( const TargetProcessAssignmentFunction & func const memory_t perProcessMemoryLimit, const bool reorderProcessesByBFS, const bool insertBufferProcesses ) { - WALBERLA_LOG_PROGRESS( "Balancing SetupBlockForest: Creating a process distribution for " << numberOfProcesses_ << " process(es) ..." ) + WALBERLA_LOG_PROGRESS( "Balancing SetupBlockForest: Creating a process distribution for " << numberOfProcesses << " process(es) ..." ) balanceLoadHelper( function, numberOfProcesses, numberOfBufferProcesses, perProcessMemoryLimit, reorderProcessesByBFS, insertBufferProcesses ); } diff --git a/src/blockforest/communication/NonUniformBufferedScheme.h b/src/blockforest/communication/NonUniformBufferedScheme.h index caf91651c578ddc7da5bea5b8a67398e8cc590ee..be27a51ec805285144983d2d3a3618c502596d50 100644 --- a/src/blockforest/communication/NonUniformBufferedScheme.h +++ b/src/blockforest/communication/NonUniformBufferedScheme.h @@ -65,10 +65,10 @@ public: //**Construction & Destruction*************************************************************************************** /*! 
\name Construction & Destruction */ //@{ - explicit NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, + explicit NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 - NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, + NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 @@ -96,6 +96,16 @@ public: inline void communicateEqualLevel ( const uint_t level ); inline void communicateCoarseToFine( const uint_t fineLevel ); inline void communicateFineToCoarse( const uint_t fineLevel ); + + std::function<void()> communicateEqualLevelFunctor(const uint_t level) { + return [level, this](){ NonUniformBufferedScheme::communicateEqualLevel(level);}; + } + std::function<void()> communicateCoarseToFineFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformBufferedScheme::communicateCoarseToFine(fineLevel);}; + } + std::function<void()> communicateFineToCoarseFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformBufferedScheme::communicateFineToCoarse(fineLevel);}; + } //@} //******************************************************************************************************************* @@ -190,7 +200,7 @@ protected: template< typename Stencil > -NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, const int baseTag ) +NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const int baseTag ) : blockForest_( bf ), localMode_( START ), baseTag_( baseTag ), requiredBlockSelectors_( Set<SUID>::emptySet() ), incompatibleBlockSelectors_( Set<SUID>::emptySet() ) { @@ -200,7 +210,7 @@ NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( weak_ptr<Structured template< 
typename Stencil > -NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( weak_ptr<StructuredBlockForest> bf, +NonUniformBufferedScheme<Stencil>::NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, const int baseTag /*= 778*/ ) // waLBerla = 119+97+76+66+101+114+108+97 @@ -236,10 +246,10 @@ void NonUniformBufferedScheme<Stencil>::init() template< typename Stencil > void NonUniformBufferedScheme<Stencil>::refresh() { - WALBERLA_ASSERT( !isAnyCommunicationInProgress() ); + WALBERLA_ASSERT( !isAnyCommunicationInProgress() ) auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levels = forest->getNumberOfLevels(); for( uint_t i = 0; i != 3; ++i ) @@ -296,7 +306,7 @@ inline void NonUniformBufferedScheme<Stencil>::addPackInfo( const PackInfo & pac { if( isAnyCommunicationInProgress() ) { - WALBERLA_ABORT( "You may not add a PackInfo to a NonUniformBufferedScheme if any communication is in progress!" ); + WALBERLA_ABORT( "You may not add a PackInfo to a NonUniformBufferedScheme if any communication is in progress!" 
) } packInfos_.push_back( packInfo ); @@ -381,7 +391,7 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateEqualLevel() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) @@ -400,7 +410,7 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateCoarseToFine() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); if( levelIndex == 1 ) @@ -421,7 +431,7 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateFineToCoarse() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); if( levelIndex == 1 ) @@ -442,8 +452,8 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateEqualLevel( const uint_t level ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, 
"Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ) if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); @@ -460,9 +470,9 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateCoarseToFine( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); @@ -479,9 +489,9 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::startCommunicateFineToCoarse( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); @@ -498,10 +508,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateEqualLevel() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access 
communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); - WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( EQUAL_LEVEL, levelIndex ); } @@ -512,10 +522,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateCoarseToFine() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = forest->getNumberOfLevels(); - WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) if( levelIndex == 1 ) return; @@ -529,10 +539,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateFineToCoarse() { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) const uint_t levelIndex = 
forest->getNumberOfLevels(); - WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_ASSERT_EQUAL( levelIndex, bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) if( levelIndex == 1 ) return; @@ -546,10 +556,10 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateEqualLevel ( const uint_t level ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ); - WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_LESS( level, forest->getNumberOfLevels() ) + WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[EQUAL_LEVEL].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( EQUAL_LEVEL, level ); } @@ -560,11 +570,11 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateCoarseToFine( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); - WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ); - 
WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) + WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[COARSE_TO_FINE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( COARSE_TO_FINE, fineLevel ); } @@ -575,11 +585,11 @@ template< typename Stencil > inline void NonUniformBufferedScheme<Stencil>::waitCommunicateFineToCoarse( const uint_t fineLevel ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); - WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ); - WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ); - WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ); - WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) + WALBERLA_ASSERT_GREATER( fineLevel, uint_t(0) ) + WALBERLA_ASSERT_LESS( fineLevel, forest->getNumberOfLevels() ) + WALBERLA_ASSERT_EQUAL( forest->getNumberOfLevels(), bufferSystem_[FINE_TO_COARSE].size() - uint_t(1) ) + WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ) wait( FINE_TO_COARSE, fineLevel ); } @@ -619,7 +629,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint std::map< uint_t, std::vector< SendBufferFunction > > sendFunctions; auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block 
storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { @@ -638,7 +648,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint if( !( block->neighborhoodSectionHasEquallySizedBlock(neighborIdx) ) ) continue; - WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ) const BlockID & receiverId = block->getNeighborId( neighborIdx, uint_t(0) ); @@ -648,13 +658,13 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) { auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); - WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ); + WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers.push_back( buffer ); const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); @@ -745,7 +755,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui std::set< uint_t > ranksToReceiveFrom; auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { Block * block = dynamic_cast< Block * >( it.get() ); @@ -774,13 +784,13 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui if( block->neighborExistsLocally( 
neighborIdx, n ) ) { auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); - WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ); + WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers.push_back( buffer ); const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); @@ -829,7 +839,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); if( block->neighborhoodSectionHasLargerBlock(neighborIdx) ) { - WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ) if( block->neighborExistsRemotely( neighborIdx, uint_t(0) ) && selectable::isSetSelected( block->getNeighborState( neighborIdx, 0 ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) { @@ -890,7 +900,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui std::set< uint_t > ranksToReceiveFrom; auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { @@ -910,7 +920,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui if( !( block->neighborhoodSectionHasLargerBlock(neighborIdx) ) ) continue; - WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize(neighborIdx), uint_t(1) ) const BlockID & receiverId = block->getNeighborId( 
neighborIdx, uint_t(0) ); @@ -920,13 +930,13 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) { auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); - WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ); + WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers.push_back( buffer ); const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); @@ -1144,7 +1154,7 @@ template< typename Stencil > void NonUniformBufferedScheme<Stencil>::receive( RecvBuffer & buffer ) { auto forest = blockForest_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) while( !buffer.isEmpty() ) { @@ -1183,7 +1193,7 @@ template< typename Stencil > void NonUniformBufferedScheme<Stencil>::localBufferPacking( const INDEX i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo, const Block * sender, const Block * receiver, const stencil::Direction & dir ) { - WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ); + WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ) SendBuffer & buffer = localBuffers_[i][j][ bufferIndex ]; buffer.clear(); @@ -1198,7 +1208,7 @@ void NonUniformBufferedScheme<Stencil>::localBufferPacking( const INDEX i, const } else { - WALBERLA_ASSERT( i == FINE_TO_COARSE ); + WALBERLA_ASSERT( i == FINE_TO_COARSE ) packInfo->packDataFineToCoarse( sender, receiver->getId(), dir, buffer ); } } @@ -1209,7 +1219,7 @@ template< typename Stencil > void NonUniformBufferedScheme<Stencil>::localBufferUnpacking( const INDEX 
i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo, Block * receiver, const Block * sender, const stencil::Direction & dir ) { - WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ); + WALBERLA_ASSERT_LESS( bufferIndex, localBuffers_[i][j].size() ) SendBuffer & sendBuffer = localBuffers_[i][j][ bufferIndex ]; RecvBuffer recvBuffer( sendBuffer ); @@ -1224,7 +1234,7 @@ void NonUniformBufferedScheme<Stencil>::localBufferUnpacking( const INDEX i, con } else { - WALBERLA_ASSERT( i == FINE_TO_COARSE ); + WALBERLA_ASSERT( i == FINE_TO_COARSE ) packInfo->unpackDataFineToCoarse( receiver, sender->getId(), stencil::inverseDir[dir], recvBuffer ); } } diff --git a/src/blockforest/communication/NonUniformPackInfo.h b/src/blockforest/communication/NonUniformPackInfo.h index 0b32369c654e4ca9642d88f5d85763f880b7e55d..73c3f760fbfb54b3af1be35fdd2d633e3495269e 100644 --- a/src/blockforest/communication/NonUniformPackInfo.h +++ b/src/blockforest/communication/NonUniformPackInfo.h @@ -106,13 +106,13 @@ protected: inline void NonUniformPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataEqualLevelImpl( sender, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -125,7 +125,7 @@ inline void NonUniformPackInfo::packDataEqualLevel( const Block * sender, stenci if( dirEntry == sizeMap.end() ) sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif @@ -138,13 +138,13 @@ inline void NonUniformPackInfo::packDataEqualLevel( const Block * sender, stenci inline void NonUniformPackInfo::packDataCoarseToFine( const Block * 
coarseSender, const BlockID & fineReceiver, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -157,7 +157,7 @@ inline void NonUniformPackInfo::packDataCoarseToFine( const Block * coarseSender if( dirEntry == sizeMap.end() ) sizeMap[ fineReceiver.getBranchId() ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( sizeMap[ fineReceiver.getBranchId() ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( sizeMap[ fineReceiver.getBranchId() ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif @@ -170,13 +170,13 @@ inline void NonUniformPackInfo::packDataCoarseToFine( const Block * coarseSender inline void NonUniformPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -189,7 +189,7 @@ inline void NonUniformPackInfo::packDataFineToCoarse( const Block * fineSender, if( dirEntry == sizeMap.end() ) sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif diff --git a/src/blockforest/communication/UniformBufferedScheme.h b/src/blockforest/communication/UniformBufferedScheme.h index 8677b5f83afe7c4ba8bb047fbd95724bc7ce3ac3..7bc813cc5067a8d4336bc376dc643314defa1816 100644 --- 
a/src/blockforest/communication/UniformBufferedScheme.h +++ b/src/blockforest/communication/UniformBufferedScheme.h @@ -314,7 +314,7 @@ void UniformBufferedScheme<Stencil>::startCommunication() { if( localMode_ == BUFFER ) { - SendBuffer buffer; + SendBuffer const buffer; localBuffers_.push_back( buffer ); const uint_t index = uint_c( localBuffers_.size() ) - uint_t(1); diff --git a/src/communication/UniformPackInfo.h b/src/communication/UniformPackInfo.h index 5ec6db29d32dff36713ab903498048b450f748f2..aa110f9bdf5c51b37a57572cfbc800b004ab37b6 100644 --- a/src/communication/UniformPackInfo.h +++ b/src/communication/UniformPackInfo.h @@ -153,13 +153,13 @@ protected: inline void UniformPackInfo::packData( const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & buffer ) const { #ifndef NDEBUG - size_t sizeBefore = buffer.size(); + size_t const sizeBefore = buffer.size(); #endif packDataImpl( sender, dir, buffer ); #ifndef NDEBUG - size_t sizeAfter = buffer.size(); + size_t const sizeAfter = buffer.size(); if( constantDataExchange() ) { #ifdef _OPENMP @@ -171,7 +171,7 @@ inline void UniformPackInfo::packData( const IBlock * sender, stencil::Direction if( dirEntry == blockMap.end() ) blockMap[ dir ] = sizeAfter - sizeBefore; else - WALBERLA_ASSERT_EQUAL( blockMap[ dir ], (sizeAfter - sizeBefore) ); + WALBERLA_ASSERT_EQUAL( blockMap[ dir ], (sizeAfter - sizeBefore) ) #ifdef _OPENMP } #endif diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index f3a6dc20b97c034513391b21528648f51b394299..099e0b5732a9d5b112600d150162f728bf8dcb8f 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -57,8 +57,6 @@ target_sources( core Hostname.h Macros.h MemoryUsage.h - MultiArrayIO.h - MultiArrayIO.impl.h NonCopyable.h NonCreateable.h OpenMP.h diff --git a/src/core/DataTypes.h b/src/core/DataTypes.h index 2f868719787ae5d6930cb55581bfc3df1298d4e5..bae5b7651eaa17bc67c9fe822eeb386de38f61ca 100644 --- a/src/core/DataTypes.h +++ b/src/core/DataTypes.h @@ 
-167,6 +167,33 @@ using real_t = double; using real_t = float; #endif +/// Half precision support. Experimental. Use carefully. +/// +/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support. +/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and +/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16. +/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future. +/// +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT +# if defined(WALBERLA_CXX_COMPILER_IS_CLANG) || defined(WALBERLA_CXX_COMPILER_IS_GNU) +/// Clang version must be 15 or higher for x86 half precision support. +/// GCC version must be 12 or higher for x86 half precision support. +/// Also support seems to require SSE, so ensure that respective instruction sets are enabled. +/// See +/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point +/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html +/// for more information. 
+using half = _Float16; +using float16 = half; +# else +static_assert(false, "\n\n### Attempting to built walberla with half precision support.\n" + "### However, the compiler you chose is not suited for that, or we simply have not implemented " + "support for half precision and your compiler.\n"); +# endif +#endif +using float32 = float; +using float64 = double; + inline constexpr real_t operator"" _r( long double t ) { return static_cast< real_t >(t); } inline constexpr real_t operator"" _r( unsigned long long int t ) { return static_cast< real_t >(t); } template< typename T > inline real_t real_c ( T t ) { return numeric_cast< real_t >(t); } ///< cast to type real_t using "real_c(x)" diff --git a/src/core/MultiArrayIO.h b/src/core/MultiArrayIO.h deleted file mode 100644 index 097fb1c99ff0a2879c3b44b3184cee3d5cdf3428..0000000000000000000000000000000000000000 --- a/src/core/MultiArrayIO.h +++ /dev/null @@ -1,65 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file MultiArrayIO.h -//! \ingroup config -//! 
\author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#pragma once - - -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning( disable : 4189 ) -# pragma warning( disable : 4100 ) -# pragma warning( disable : 4458 ) -# pragma warning( disable : 4459 ) -# pragma warning( disable : 4510 ) -# pragma warning( disable : 4610 ) -#endif //_MSC_VER - -#include <boost/multi_array.hpp> - -#ifdef _MSC_VER -# pragma warning(pop) -#endif //_MSC_VER - - -namespace boost { - - // 1D Arrays - template< typename T> - std::istream & operator>> ( std::istream & is, boost::multi_array<T,1> & arr ); - - template<typename T> - std::ostream & operator<< ( std::ostream & os, const boost::multi_array<T,1> & arr ); - - - - // 2D Arrays - template<typename T> - std::istream & operator>> ( std::istream & is, boost::multi_array<T,2> & arr ); - - template<typename T> - std::ostream & operator<< ( std::ostream & os, const boost::multi_array<T,2> & arr ); - - - -} // namespace walberla - - -#include "MultiArrayIO.impl.h" diff --git a/src/core/MultiArrayIO.impl.h b/src/core/MultiArrayIO.impl.h deleted file mode 100644 index c9e671654aeaaf1bca3eb8d957e6d19098bde137..0000000000000000000000000000000000000000 --- a/src/core/MultiArrayIO.impl.h +++ /dev/null @@ -1,231 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file MultiArrayIO.h -//! \ingroup config -//! \author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "core/DataTypes.h" -#include "core/StringUtility.h" - -#include <cstddef> -#include <sstream> - -namespace boost { - -//=================================================================================================================== -// -// Helper Functions -// -//=================================================================================================================== - - -template<typename T> -bool parseArray1D( std::vector<T> & arr, std::istream & is, - const char openingBracket='[', - const char closingBracket=']', - const std::string & delimiter = ", \t\n" ) -{ - - is >> std::skipws; - - char bracket1; - if( !(is >> bracket1 ) || bracket1 != openingBracket ) - return false; - - std::string line; - if ( ! 
std::getline( is, line, closingBracket ) ) - return false; - - - std::vector<std::string> stringArr = walberla::string_split( line, delimiter ); - - arr.clear(); - arr.reserve( stringArr.size() ); - for( auto sArrIt =stringArr.begin(); sArrIt != stringArr.end(); ++sArrIt ) - { - if ( *sArrIt == "") continue; - - std::stringstream ss ( *sArrIt ); - T value; - ss >> value; - arr.push_back( value ); - } - return true; -} - - - -template<typename T > -bool parseArray2D( std::vector< std::vector<T> > & arr, std::istream & is, - const char openingBracket='[', - const char closingBracket=']', - const std::string & delimiter = ", \t\n" ) -{ - is >> std::skipws; - char bracket1; - if( !(is >> bracket1 ) || bracket1 != openingBracket ) - return false; - - do - { - arr.push_back( std::vector<T>() ); - if ( ! parseArray1D( arr.back(), is, openingBracket, closingBracket, delimiter ) ) - return false; - - while( delimiter.find( (char)( is.peek()) ) != std::string::npos ) - is.get(); - - if ( is.peek() == closingBracket ) { - is.get(); - return true; - } - - } while ( true ); - -} - - -//=================================================================================================================== -// -// IO Operators -// -//=================================================================================================================== - - -template<typename T> -std::istream & operator>> ( std::istream & is, boost::multi_array<T,1> & arr ) -{ - if ( !is ) return is; - - const std::istream::pos_type pos( is.tellg() ); - const std::istream::fmtflags oldFlags( is.flags() ); - - - std::vector< T > vec; - if ( ! 
parseArray1D( vec, is ) ) { - is.clear(); - is.seekg( pos ); - is.setstate( std::istream::failbit ); - is.flags( oldFlags ); - return is; - } - - auto rows = vec.size(); - if ( rows == 0 ) - return is; - - arr.resize( boost::extents[walberla::numeric_cast< boost::multi_array_types::index >(rows)] ); - - for( std::size_t r = 0; r < rows; ++r ) - arr[walberla::numeric_cast< boost::multi_array_types::index >(r)] = vec[r]; - - return is; -} - -template<typename T> -std::ostream & operator<< ( std::ostream & os, const boost::multi_array<T,1> & arr ) -{ - os << "[ "; - for( std::size_t c = 0; c < arr.size(); ++c ) - os << arr[walberla::numeric_cast< boost::multi_array_types::index >(c)] << ","; - os << "]"; - - return os; -} - - - - - - -template<typename T> -std::istream & operator>> ( std::istream & is, boost::multi_array<T,2> & arr ) -{ - if ( !is ) return is; - - const std::istream::pos_type pos( is.tellg() ); - const std::istream::fmtflags oldFlags( is.flags() ); - - - std::vector< std::vector<T> > vec2D; - if ( ! 
parseArray2D( vec2D, is ) ) { - is.clear(); - is.seekg( pos ); - is.setstate( std::istream::failbit ); - is.flags( oldFlags ); - return is; - } - - std::size_t rows = vec2D.size(); - if ( rows == 0 ) - return is; - - std::size_t cols = vec2D[0].size(); - for( std::size_t r = 0; r < rows; ++r ) - { - if ( vec2D[r].size() != cols ) - { - // non square vector - is.clear(); - is.seekg( pos ); - is.setstate( std::istream::failbit ); - is.flags( oldFlags ); - return is; - } - } - - arr.resize( boost::extents[ walberla::numeric_cast< boost::multi_array_types::index >(rows) ][ walberla::numeric_cast< boost::multi_array_types::index >(cols) ] ); - - for( std::size_t r = 0; r < rows; ++r ) - for( std::size_t c = 0; c < cols; ++c ) - arr[walberla::numeric_cast< boost::multi_array_types::index >(r)][walberla::numeric_cast< boost::multi_array_types::index >(c)] = vec2D[r][c]; - - - return is; -} - -template<typename T> -std::ostream & operator<< ( std::ostream & os, const boost::multi_array<T,2> & arr ) -{ - os << "[\n"; - - for( std::size_t r = 0; r < arr.size(); ++r ) - { - os << " [ "; - for( std::size_t c = 0; c < arr[walberla::numeric_cast< boost::multi_array_types::index >(r)].size(); ++c ) { - os << arr[walberla::numeric_cast< boost::multi_array_types::index >(r)][walberla::numeric_cast< boost::multi_array_types::index >(c)] << "\t"; - } - os << "] \n"; - } - os << "]"; - - return os; -} - - - - - - - - -} // namespace walberla - - diff --git a/src/core/RandomUUID.h b/src/core/RandomUUID.h index 45c1c0808778c6d84e943ce4e684a73315631e21..7218ba93812c2589baf6ec577ad73ac4534829b3 100644 --- a/src/core/RandomUUID.h +++ b/src/core/RandomUUID.h @@ -20,10 +20,13 @@ #pragma once +#include "core/DataTypes.h" + #include <ostream> #include <string> -namespace walberla { +namespace walberla +{ /** * Replacement for boost::uuids::uuid and boost::uuids::random_generator @@ -34,7 +37,8 @@ class RandomUUID { friend bool operator==(const RandomUUID& lhs, const RandomUUID& rhs); friend 
bool operator!=(const RandomUUID& lhs, const RandomUUID& rhs); -public: + + public: using UIntType = uint64_t; RandomUUID(); @@ -47,9 +51,10 @@ public: */ std::string toString() const; - UIntType getFirstUInt() const {return a_;} - UIntType getSecondUInt() const {return b_;} -private: + UIntType getFirstUInt() const { return a_; } + UIntType getSecondUInt() const { return b_; } + + private: UIntType a_; ///< first part of the uuid UIntType b_; ///< second part of the uuid }; @@ -58,4 +63,4 @@ bool operator==(const RandomUUID& lhs, const RandomUUID& rhs); bool operator!=(const RandomUUID& lhs, const RandomUUID& rhs); std::ostream& operator<<(std::ostream& os, const RandomUUID& uuid); -} +} // namespace walberla diff --git a/src/core/cell/Cell.h b/src/core/cell/Cell.h index 8f41297b78a1ff66d4cc7a9f39f98692d11d1b49..f531430ea1733cca2ccf650a944bd92301d623cd 100644 --- a/src/core/cell/Cell.h +++ b/src/core/cell/Cell.h @@ -50,9 +50,10 @@ public: //@{ Cell() = default; inline Cell( const cell_idx_t _x, const cell_idx_t _y, const cell_idx_t _z ) { cell[0] = _x; cell[1] = _y; cell[2] = _z; } - //inline Cell( const int _x, const int _y, const int _z ); + inline Cell( const Vector3<cell_idx_t> _vec ) {cell[0] = _vec[0]; cell[1] = _vec[1]; cell[2] = _vec[2];} + inline Cell( const uint_t _x, const uint_t _y, const uint_t _z ); - inline Cell( const Vector3<cell_idx_t>& vec ){ cell[0] = vec[0]; cell[1] = vec[1]; cell[2] = vec[2]; }; + inline Cell( const Vector3<uint_t> _vec ); //@} /*! 
\name Arithmetic operators */ @@ -102,15 +103,6 @@ std::ostream & operator<<( std::ostream & os, const Cell & cell ); std::istream & operator>>( std::istream & is, Cell & cell ); //@} - - -// inline Cell::Cell( const int _x, const int _y, const int _z ) { -// -// x() = cell_idx_c( _x ); y() = cell_idx_c( _y ); z() = cell_idx_c( _z ); -// } - - - inline Cell::Cell( const uint_t _x, const uint_t _y, const uint_t _z ) { cell[0] = cell_idx_c( _x ); @@ -118,6 +110,12 @@ inline Cell::Cell( const uint_t _x, const uint_t _y, const uint_t _z ) cell[2] = cell_idx_c( _z ); } +inline Cell::Cell( const Vector3<uint_t> _vec ) +{ + cell[0] = cell_idx_c( _vec[0] ); + cell[1] = cell_idx_c( _vec[1] ); + cell[2] = cell_idx_c( _vec[2] ); +} /*******************************************************************************************************************//** diff --git a/src/core/logging/CMakeDefs.in.h b/src/core/logging/CMakeDefs.in.h index 05abf9750a6219235a69fc76549cdf2139f8dcb9..b0c8d115464299dbed548ccb3911f0362e193c7c 100644 --- a/src/core/logging/CMakeDefs.in.h +++ b/src/core/logging/CMakeDefs.in.h @@ -12,3 +12,6 @@ #cmakedefine WALBERLA_LOGLEVEL_PROGRESS #cmakedefine WALBERLA_LOGLEVEL_DETAIL #cmakedefine WALBERLA_LOGLEVEL_TRACING + +#define WALBERLA_LOGLEVEL ${WALBERLA_LOGLEVEL} +#define WALBERLA_LOGLEVEL_STRING "${WALBERLA_LOGLEVEL}" \ No newline at end of file diff --git a/src/core/logging/Logging.cpp b/src/core/logging/Logging.cpp index 0c3718d0f37ad16ae845828d22349b6bbc595ec8..bd2dfa403b03b16f7aa619cf3ee6cc12ab5972bc 100644 --- a/src/core/logging/Logging.cpp +++ b/src/core/logging/Logging.cpp @@ -43,25 +43,35 @@ const std::string Logging::TRACING_TAG = std::string( "[TRACING ]" ); const uint_t Logging::TAG_WIDTH = uint_t(10); const uint_t Logging::TIMESTAMP_WIDTH = uint_t(17); - - void Logging::setStreamLogLevel( LogLevel logLevel ) { #ifndef WALBERLA_LOGLEVEL_INFO if( logLevel == INFO ) - logWarning( "You are trying to set the stream log level to INFO, but INFO logs are 
deactivated by CMake!", "Logging::setStreamLogLevel", -1 ); + logWarning( "You are trying to set the stream log level to INFO, but INFO logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=INFO to activate INFO logs.", + "Logging::setStreamLogLevel", -1 ); #endif #ifndef WALBERLA_LOGLEVEL_PROGRESS if( logLevel == PROGRESS ) - logWarning( "You are trying to set the stream log level to PROGRESS, but PROGRESS logs are deactivated by CMake!", "Logging::setStreamLogLevel", -1 ); + logWarning( "You are trying to set the stream log level to PROGRESS, but PROGRESS logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=PROGRESS to activate PROGRESS logs.", + "Logging::setStreamLogLevel", -1 ); #endif #ifndef WALBERLA_LOGLEVEL_DETAIL if( logLevel == DETAIL ) - logWarning( "You are trying to set the stream log level to DETAIL, but DETAIL logs are deactivated by CMake!", "Logging::setStreamLogLevel", -1 ); + logWarning( "You are trying to set the stream log level to DETAIL, but DETAIL logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=DETAIL to activate DETAIL logs.", + "Logging::setStreamLogLevel", -1 ); #endif #ifndef WALBERLA_LOGLEVEL_TRACING if( logLevel == TRACING ) - logWarning( "You are trying to set the stream log level to TRACING, but TRACING logs are deactivated by CMake!", "Logging::setStreamLogLevel", -1 ); + logWarning( "You are trying to set the stream log level to TRACING, but TRACING logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" 
+ "Set WALBERLA_LOGLEVEL=TRACING to activate TRACING logs.", + "Logging::setStreamLogLevel", -1 ); #endif streamLogLevel_ = logLevel; } @@ -72,19 +82,31 @@ void Logging::setFileLogLevel( LogLevel logLevel ) { #ifndef WALBERLA_LOGLEVEL_INFO if( logLevel == INFO ) - logWarning( "You are trying to set the file log level to INFO, but INFO logs are deactivated by CMake!", "Logging::setFileLogLevel", -1 ); + logWarning( "You are trying to set the file log level to INFO, but INFO logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=INFO to activate INFO logs.", + "Logging::setFileLogLevel", -1 ); #endif #ifndef WALBERLA_LOGLEVEL_PROGRESS if( logLevel == PROGRESS ) - logWarning( "You are trying to set the file log level to PROGRESS, but PROGRESS logs are deactivated by CMake!", "Logging::setFileLogLevel", -1 ); + logWarning( "You are trying to set the file log level to PROGRESS, but PROGRESS logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=PROGRESS to activate PROGRESS logs.", + "Logging::setFileLogLevel", -1 ); #endif #ifndef WALBERLA_LOGLEVEL_DETAIL if( logLevel == DETAIL ) - logWarning( "You are trying to set the file log level to DETAIL, but DETAIL logs are deactivated by CMake!", "Logging::setFileLogLevel", -1 ); + logWarning( "You are trying to set the file log level to DETAIL, but DETAIL logs are deactivated by CMake!" + "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=DETAIL to activate DETAIL logs.", + "Logging::setFileLogLevel", -1 ); #endif #ifndef WALBERLA_LOGLEVEL_TRACING if( logLevel == TRACING ) - logWarning( "You are trying to set the file log level to TRACING, but TRACING logs are deactivated by CMake!", "Logging::setFileLogLevel", -1 ); + logWarning( "You are trying to set the file log level to TRACING, but TRACING logs are deactivated by CMake!" 
+ "The current WALBERLA_LOGLEVEL is: " WALBERLA_LOGLEVEL_STRING "!" + "Set WALBERLA_LOGLEVEL=TRACING to activate TRACING logs.", + "Logging::setFileLogLevel", -1 ); #endif fileLogLevel_ = logLevel; } @@ -215,4 +237,4 @@ bool Logging::isInIgnoreCallerPaths( const std::vector< walberla::regex > & rege } // namespace logging -} // namespace walberla +} // namespace walberla \ No newline at end of file diff --git a/src/core/logging/Logging.h b/src/core/logging/Logging.h index 8d174df51acee3b451cf17a211b1f29c5325ca78..e891ecf5dff1ecf452d1b01b7f59be2d160a5c70 100644 --- a/src/core/logging/Logging.h +++ b/src/core/logging/Logging.h @@ -186,7 +186,7 @@ private: inline Logging::Logging() : singleton::Singleton<Logging>(), - streamLogLevel_( INFO ), fileLogLevel_( INFO ), + streamLogLevel_( WALBERLA_LOGLEVEL ), fileLogLevel_( WALBERLA_LOGLEVEL ), processId_( uint_c( mpi::MPIManager::instance()->worldRank() ) ), numberOfProcesses_( uint_c( mpi::MPIManager::instance()->numProcesses() ) ), startTime_( timing::WcPolicy::getTimestamp() ), showTimeStamp_( true ), logCallerPath_( false ) @@ -654,7 +654,7 @@ public: }\ } #else -#define WALBERLA_LOG_INFO(msg) (void(0)) +#define WALBERLA_LOG_INFO(msg) (void(0)); #endif #ifdef WALBERLA_LOGLEVEL_INFO #define WALBERLA_LOG_INFO_ON_ROOT(msg){\ @@ -666,7 +666,7 @@ public: }}\ } #else -#define WALBERLA_LOG_INFO_ON_ROOT(msg) (void(0)) +#define WALBERLA_LOG_INFO_ON_ROOT(msg) (void(0)); #endif ////////////// @@ -682,7 +682,7 @@ public: }\ } #else -#define WALBERLA_LOG_PROGRESS(msg) (void(0)) +#define WALBERLA_LOG_PROGRESS(msg) (void(0)); #endif #ifdef WALBERLA_LOGLEVEL_PROGRESS #define WALBERLA_LOG_PROGRESS_ON_ROOT(msg){\ @@ -694,7 +694,7 @@ public: }}\ } #else -#define WALBERLA_LOG_PROGRESS_ON_ROOT(msg) (void(0)) +#define WALBERLA_LOG_PROGRESS_ON_ROOT(msg) (void(0)); #endif //////////// @@ -710,7 +710,7 @@ public: }\ } #else -#define WALBERLA_LOG_DETAIL(msg) (void(0)) +#define WALBERLA_LOG_DETAIL(msg) (void(0)); #endif #ifdef 
WALBERLA_LOGLEVEL_DETAIL #define WALBERLA_LOG_DETAIL_ON_ROOT(msg){\ @@ -722,7 +722,7 @@ public: }}\ } #else -#define WALBERLA_LOG_DETAIL_ON_ROOT(msg) (void(0)) +#define WALBERLA_LOG_DETAIL_ON_ROOT(msg) (void(0)); #endif ///////////// diff --git a/src/core/logging/Tracing.h b/src/core/logging/Tracing.h index 7d5a4633a4ba4e6da7ecbbd67c146cfd0376606f..1a4ee2f053df879e3621628d3f43222a2b683450 100644 --- a/src/core/logging/Tracing.h +++ b/src/core/logging/Tracing.h @@ -38,7 +38,7 @@ #ifdef WALBERLA_LOGLEVEL_TRACING # define WALBERLA_TRACE_IN walberla::logging::Tracer walberlaTracingObject( __FUNCTION__, __FILE__, __LINE__ ) #else -# define WALBERLA_TRACE_IN (void(0)) +# define WALBERLA_TRACE_IN (void(0)); #endif diff --git a/src/core/logging/logging.cmake b/src/core/logging/logging.cmake index 339c33c9519c93d5878af91b9d2902edad87505b..d0b4da21673baf4f1a9deade4dc5c103709d7985 100644 --- a/src/core/logging/logging.cmake +++ b/src/core/logging/logging.cmake @@ -1,5 +1,5 @@ -SET( WALBERLA_LOGLEVEL "PROGRESS" CACHE STRING "Set log level at compile time. Possible options: WARNING, INFO, PROGRESS, DETAIL, TRACING" ) +SET( WALBERLA_LOGLEVEL "INFO" CACHE STRING "Set log level at compile time. 
Possible options: WARNING, INFO, PROGRESS, DETAIL, TRACING" ) SET_PROPERTY( CACHE WALBERLA_LOGLEVEL PROPERTY STRINGS WARNING INFO PROGRESS DETAIL TRACING ) SET( WALBERLA_LOGLEVEL_WARNING 1 ) diff --git a/src/core/math/CMakeLists.txt b/src/core/math/CMakeLists.txt index fc18eccf217cd3c2b13358cf5119c3db5f22a065..5f8733c9ae7fbcf9f8f31a50292e34c9b2c6495f 100644 --- a/src/core/math/CMakeLists.txt +++ b/src/core/math/CMakeLists.txt @@ -22,8 +22,6 @@ target_sources( core Parser.h ParserOMP.cpp ParserOMP.h - PhysicalCheck.cpp - PhysicalCheck.h Plane.h Primes.cpp Primes.h @@ -42,17 +40,4 @@ target_sources( core Vector2.h Vector3.h extern/exprtk.h - equation_system/Equation.cpp - equation_system/Equation.h - equation_system/EquationParser.cpp - equation_system/EquationParser.h - equation_system/EquationSystem.cpp - equation_system/EquationSystem.h - equation_system/FwdEquation.h - equation_system/FwdOperator.h - equation_system/FwdVariable.h - equation_system/Operator.cpp - equation_system/Operator.h - equation_system/Variable.cpp - equation_system/Variable.h ) diff --git a/src/core/math/PhysicalCheck.cpp b/src/core/math/PhysicalCheck.cpp deleted file mode 100644 index 7288961752c9ab84be7521e994200bfa7349f63a..0000000000000000000000000000000000000000 --- a/src/core/math/PhysicalCheck.cpp +++ /dev/null @@ -1,545 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. 
-// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file PhysicalCheck.cpp -//! \ingroup core -//! \author David Staubach <david.staubach@fau.de> -// -//====================================================================================================================== - -#include "waLBerlaDefinitions.h" -#ifdef WALBERLA_BUILD_WITH_BOOST - -#include "PhysicalCheck.h" -#include "core/Abort.h" -#include "core/logging/Logging.h" -#include "core/math/Parser.h" -#include "equation_system/EquationParser.h" - - -namespace walberla { -namespace math { - - PhysicalCheck::PhysicalCheck() : - solved_( false ) - {} - - PhysicalCheck::PhysicalCheck( const std::vector< std::string >& equations, const std::map< std::string, std::string >& unitParameterRelations, const std::vector<std::string>& constraints ) : - solved_( false ) - { - addEquations( equations ); - addUnitParameterRelations( unitParameterRelations ); - addConstraints( constraints ); - } - - PhysicalCheck::PhysicalCheck( const Config::BlockHandle& configBlock ) : - solved_( false ) - { - addBlock( configBlock ); - } - - void PhysicalCheck::addBlock( const Config::BlockHandle& configBlock ) - { - auto eqBlock = configBlock.getBlock( "Equations" ); - - if( eqBlock ) - { - std::vector< std::string > equations; - for( auto i=eqBlock.begin(); i!=eqBlock.end(); ++i ) - equations.push_back(i->second); - - addEquations( equations ); - } - - auto unitBlock = configBlock.getBlock( "Units" ); - - if ( unitBlock ) - { - std::map< std::string, std::string > unitParameterRelations; - for( auto i=unitBlock.begin(); i!=unitBlock.end(); ++i ) - unitParameterRelations.insert( - std::pair<std::string,std::string>(i->first,i->second)); - - - addUnitParameterRelations( unitParameterRelations ); - } - - auto constraintsBlock = configBlock.getBlock( "Constraints" ); - - if( constraintsBlock ) - { - 
std::vector<std::string> constraints; - for( auto i=constraintsBlock.begin(); i!=constraintsBlock.end(); ++i ) - constraints.push_back(i->second); - - addConstraints( constraints ); - } - } - - void PhysicalCheck::addEquations( const std::vector< std::string >& equations ) - { - EquationParser ep(es_); - size_t index = 0; - - solved_ = false; - - for (size_t i=0; i<equations.size(); ++i){ - index = 0; - es_.add( std::to_string(es_.getNumberOfEquations()+1), ep.parseEquation( equations[i], index ) ); - } - } - - void PhysicalCheck::addEquation( const std::string& equation ) - { - EquationParser ep(es_); - size_t index = 0; - - solved_ = false; - - es_.add( std::to_string(es_.getNumberOfEquations()+1), ep.parseEquation( equation, index ) ); - } - - void PhysicalCheck::addUnitParameterRelations( const std::map< std::string, std::string >& unitParameterRelations ) - { - // parse units and store unit as int - for( auto i=unitParameterRelations.begin(); i!=unitParameterRelations.end(); ++i ) - { -// unitParameterRelations_[i->first]["m"] = 0; -// unitParameterRelations_[i->first]["s"] = 0; -// unitParameterRelations_[i->first]["kg"] = 0; -// unitParameterRelations_[i->first]["A"] = 0; - - std::string unitString = i->second; - int factor = 1; - - for( size_t j=0; j<unitString.size(); ++j ) - { - if( unitString[j] == 'm' || unitString[j] == 's' || unitString[j] == 'A' ) - { - size_t index = j; - ++j; - - // make this work for exponents larger than 9 - if( j < unitString.size() && unitString[j] == '^' ) - { - ++j; - int expo = factor * std::atoi( &unitString[j] ); - if( !setVarUnit( i->first, unitString.substr(index,1), expo ) ) - WALBERLA_ABORT( "Error in PhysicalCheck::addUnitParameterRelations(). Non-unique description for unit '" << unitString[index] << "' for parameter '" << i->first << "'." ); - } - else - { - --j; - if( !setVarUnit( i->first, unitString.substr(index,1), factor ) ) - WALBERLA_ABORT( "Error in PhysicalCheck::addUnitParameterRelations(). 
Non-unique description for unit '" << unitString[index] << "' for parameter '" << i->first << "'." ); - } - } - else if( unitString[j] == 'k' ) - { - size_t index = j; - ++j;++j; - - if( j < unitString.size() && unitString[j] == '^' ) - { - ++j; - int expo = factor * std::atoi( &unitString[j] ); - if( !setVarUnit( i->first, unitString.substr(index,2), expo ) ) - WALBERLA_ABORT( "Error in PhysicalCheck::addUnitParameterRelations(). Non-unique description for unit 'kg' for parameter '" << i->first << "'."); - } - else - { - --j; - if( !setVarUnit( i->first, unitString.substr(index,2), factor ) ) - WALBERLA_ABORT( "Error in PhysicalCheck::addUnitParameterRelations(). Non-unique description for unit 'kg' for parameter '" << i->first << "'."); - } - } - else if( unitString[j] == '/') - factor = -1; - else - continue; // necessary to allow for units of the form: 1/s - } - - // add equation for the calculation between lattice and physical units - std::stringstream ss; - ss << "'" << i->first << "_L' = '" << i->first << "'"<< getParametrizationTerm(i->first); - addEquation( ss.str() ); - } - } - - std::string PhysicalCheck::getParametrizationTerm( const std::string& varName ) - { - std::map<std::string,int> parametrizationTerm; - - parametrizationTerm["dx"] = - unitParameterRelations_[varName]["m"] - 3*unitParameterRelations_[varName]["kg"] - 5*unitParameterRelations_[varName]["A"]; - parametrizationTerm["dt"] = - unitParameterRelations_[varName]["s"] + 3*unitParameterRelations_[varName]["A"]; - parametrizationTerm["rho"] = - unitParameterRelations_[varName]["kg"] - unitParameterRelations_[varName]["A"]; - - std::stringstream num; - std::stringstream denom; - for( auto i=parametrizationTerm.begin(); i!=parametrizationTerm.end(); ++i ) - { - if( i->second == 0 ) - continue; - - if( i->second < 0 ) - { - if( !denom.str().empty() ) - denom << " * "; - denom << i->first; - } - else - { - num << " * " << i->first; - } - - if( i->second < -1 ) - denom << " ^ " << std::abs( 
i->second ); - else if( i->second > 1 ) - num << " ^ " << i->second; - } - - if( num.str().empty() && denom.str().empty() ) - return std::string(); - - if( denom.str().empty() ) - { - num << " *"; - return num.str(); - } - - if( num.str().empty() ) - num << " * 1"; - - num << " / ( "; - num << denom.str() << " )"; - return num.str(); - } - - void PhysicalCheck::addConstraints( const std::vector<std::string>& constraints ) - { - for( auto i=constraints.begin(); i!=constraints.end(); ++i ) - constraints_.push_back( *i ); - } - - void PhysicalCheck::completeConfig( const shared_ptr<Config>& config ) - { - auto globalBlock = config->getWritableGlobalBlock(); - - std::map<std::string,double> symbolTable; - getVarMap(symbolTable); - - completeBlock( globalBlock, symbolTable ); - } - - void PhysicalCheck::completeBlock( Config::Block& configBlock, const std::map<std::string,double>& symbolTable ) - { - if( configBlock.getKey() == "Physical_Check" ) - return; - - // traverse all parameters in the current block - for( auto param=configBlock.begin(); param!=configBlock.end(); ++param ) - { - // check for "'" in the string and erase if present - std::string expression(param->second); - if( expression[0] == '\'' ) - { - expression.erase( std::remove(expression.begin(), expression.end(), '\''), expression.end() ); - - FunctionParser funcParser; - try - { - // hand the expression over to the FunctionParser - funcParser.parse(expression); - } - catch( std::exception& ) - { - WALBERLA_ABORT( "BadSyntaxException when completing Config-File. Block: " << configBlock.getKey()<< ", Parameter: "<< param->first ); - } - - double result=0; - try - { - result = funcParser.evaluate(symbolTable); - } - catch( std::exception& ) - { - WALBERLA_ABORT( "UnknownSymbolException when completing Config-File. 
Block: " << configBlock.getKey() ); - } - - // set the current parameter to the evaluated result in the current block - std::ostringstream os; - os << std::setprecision(16) << result; - configBlock.setParameter( param->first, os.str() ); - - // check for the equality of the parameter and the result - double val = configBlock.getParameter<double>( param->first ); - if( !floatIsEqual( val, result ) ) - WALBERLA_ABORT( "Error in PhysicalCheck::completeBlock(). Failure when trying to complete Block: " << configBlock.getKey() ); - } - } - - // recursive call of the inner blocks within the current block - std::vector<Config::Block*> innerBlocks; - configBlock.getWritableBlocks( innerBlocks ); - - for( auto innerBlock=innerBlocks.begin();innerBlock!=innerBlocks.end();++innerBlock ) - completeBlock( **innerBlock, symbolTable ); - } - - bool PhysicalCheck::isDefined( const std::string& varName ) - { - if( !solved_ ) - solve(); - - return es_.isVarDefined( varName ); - } - - double PhysicalCheck::getVarValue( const std::string& varName ) - { - if( !isDefined(varName) ) - { - WALBERLA_ABORT( "Error in PhysicalCheck::getVarValue(). Variable not found: " << varName ); - return 0; - } - - return es_.getVarValue(varName); - } - - std::string PhysicalCheck::getVarUnit( const std::string& varName ) - { - if( !isDefined(varName) ) - { - WALBERLA_ABORT( "Error in PhysicalCheck::getVarUnit(). 
Variable not found: " << varName ); - return std::string(); - } - - std::stringstream num; - std::stringstream denom; - for( auto i=unitParameterRelations_[varName].begin(); i!=unitParameterRelations_[varName].end(); ++i ) - { - if( i->second == 0 ) - continue; - - if( i->second < 0 ) - denom << i->first; - else - num << i->first; - - if( i->second < -1 ) - denom << '^' << std::abs( i->second ); - else if( i->second > 1 ) - num << '^' << i->second; - } - - if( num.str().empty() && denom.str().empty() ) - return std::string(); - - if( denom.str().empty() ) - return num.str(); - - if( num.str().empty() ) - num << 1; - - num << '/'; - num << denom.str(); - return num.str(); - } - - bool PhysicalCheck::setVarUnit( const std::string& varName, const std::string& unit, const int expo ) - { - if( unitParameterRelations_[varName][unit] != 0 ) - return false; - - unitParameterRelations_[varName][unit] = expo; - return true; - } - - void PhysicalCheck::getVarMap( std::map<std::string,double>& varMap ) - { - if( !solved_ ) - solve(); - - es_.getVarMap( varMap ); - } - - void PhysicalCheck::solve() - { - if( !es_.solve() ) - WALBERLA_ABORT( "Error in PhysicalCheck::solve(). System of equations is not solvable." ); - - solved_ = true; - WALBERLA_LOG_INFO( "System of equations has been solved successfully.\n" << es_.writeVariables() ); - - if( checkConstraints() ) - WALBERLA_LOG_INFO( "All constraints are fulfilled." 
); - } - - bool PhysicalCheck::checkConstraints() - { - enum Operator{ LOWER, LOWER_EQUAL, GREATER, GREATER_EQUAL, UNEQUAL, INVALID }; - - std::map<std::string,double> symbolTable; - getVarMap( symbolTable ); - - for( auto i=constraints_.begin(); i!=constraints_.end(); ++i ) - { - std::string constraintString( *i ); - - Operator op = INVALID; - uint_t lenLHS = 0; - uint_t startRHS = 0; - - // parse constraint and search for operator <,>,<=,>=,!= - for( uint_t j=0; j<constraintString.size(); ++j ) - { - switch( constraintString[j] ) - { - case '<': - lenLHS = j; - if( constraintString[j+1] == '=' ) - { - op = LOWER_EQUAL; - startRHS = j+2; - } - else - { - op = LOWER; - startRHS = j+1; - } - break; - case '>': - lenLHS = j; - if( constraintString[j+1] == '=' ) - { - op = GREATER_EQUAL; - startRHS = j+2; - } - else - { - op = GREATER; - startRHS = j+1; - } - break; - case '!': - if( constraintString[j+1] == '=' ) - { - op = UNEQUAL; - lenLHS = j; - startRHS = j+2; - } - else - WALBERLA_ABORT( "Error in PhysicalCheck::checkConstraints(). Invalid operator '!'." ); - break; - default: - break; - } - } - - if ( op == INVALID ) - WALBERLA_ABORT( "Error in PhysicalCheck::checkConstraints(): No Operator found in " << constraintString ); - - // use the FunctionParser to parse and solve the lhs and the rhs, respectively - std::string lhs = constraintString.substr( 0, lenLHS ); - std::string rhs = constraintString.substr( startRHS, constraintString.size()-startRHS ); - - FunctionParser funcParser; - try - { - funcParser.parse(lhs); - } - catch( std::exception& ) - { - WALBERLA_ABORT( "BadSyntaxException when checking constraints. Constraint: '" << lhs << "'" ); - } - - double resultLHS=0; - try - { - resultLHS = funcParser.evaluate(symbolTable); - } - catch( std::exception& ) - { - WALBERLA_ABORT( "UnknownSymbolException when checking constraints. 
Constraint: '" << lhs << "'" ); - } - - try - { - funcParser.parse(rhs); - } - catch( std::exception& ) - { - WALBERLA_ABORT( "BadSyntaxException when checking constraints. Constraint: '" << rhs << "'" ); - } - - double resultRHS=0; - try - { - resultRHS = funcParser.evaluate(symbolTable); - } - catch( std::exception& ) - { - WALBERLA_ABORT( "UnknownSymbolException when checking constraints. Constraint: '" << rhs << "'" ); - } - - switch( op ) - { - case LOWER: - if( !(resultLHS < resultRHS) ) - WALBERLA_ABORT( "Constraint '" << constraintString << "' failed." ); - break; - case LOWER_EQUAL: - if( !(resultLHS <= resultRHS) ) - WALBERLA_ABORT( "Constraint '" << constraintString << "' failed." ); - break; - case GREATER: - if( !(resultLHS > resultRHS) ) - WALBERLA_ABORT( "Constraint '" << constraintString << "' failed." ); - break; - case GREATER_EQUAL: - if( !(resultLHS >= resultRHS) ) - WALBERLA_ABORT( "Constraint '" << constraintString << "' failed." ); - break; - case UNEQUAL: - if( floatIsEqual(resultLHS,resultRHS) ) - WALBERLA_ABORT( "Constraint '" << constraintString << "' failed." ); - break; - default: - WALBERLA_ABORT( "Error in PhysicalCheck::checkConstraints(). Entered unreachable code." 
); - break; - } - } - - return true; - } - - void PhysicalCheck::writeUnitParameterRelations() - { - std::stringstream ss; - ss << "Unit-Parameter-Relations:" << std::endl; - for( auto i=unitParameterRelations_.begin(); i!=unitParameterRelations_.end(); ++i ) - { - ss << i->first << " (" << getVarUnit(i->first) << ")" << std::endl; - } - } - - std::ostream& operator<<( std::ostream& os, PhysicalCheck& pc ) - { - return os << pc.es_; - } - -} // namespace math -} // namespace walberla - -#endif diff --git a/src/core/math/PhysicalCheck.h b/src/core/math/PhysicalCheck.h deleted file mode 100644 index 857b292f404f96b2bec8fed07ed036963afefea4..0000000000000000000000000000000000000000 --- a/src/core/math/PhysicalCheck.h +++ /dev/null @@ -1,138 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file PhysicalCheck.h -//! \ingroup core -//! 
\author David Staubach <david.staubach@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "core/config/Config.h" -#include "equation_system/EquationSystem.h" - -#include <map> -#include <string> -#include <vector> - - -namespace walberla { -namespace math { - - //=================================================================================================================== - // - // CLASS DEFINITION - // - //=================================================================================================================== - - //******************************************************************************************************************* - /*!\brief Wrapper class to check for physical properties and correctness of given input parameters - * - * This class serves as an interface between the given parameters in an input file and the - * math module classes that solve the equations and expressions of interest. - * It can be created either by passing a vector of equations, a map of unit-parameter relations and - * a vector of constraints, or by passing a BlockHandle to the input file. 
- * - * The blocks for the PhysicalCheck in the input file are bound to the following layout: - * - * Physical_Check { - * Equations { - * eq0 parameter1 = 23; - * eq1 parameter2 = 42; - * - * eq2 var1 = parameter1 + cos(parameter2); - * eq3 var2 = parameter2 - 23; - * } - * Units { - * parameter1 m; - * parameter2 m^2/s; - * } - * Constraints { - * co0 var1 > 20; - * co1 var1 <= 30; - * co2 var2 != var1; - * } - * } - * - * Geometry { - * BoundaryConditionXYZ { - * velocity 'parameter1'; - * pressure 'var1 * 29.9'; - * } - * - * BoundaryConditionABC { - * velocity 'parameter1'; - * } - * } - */ - class PhysicalCheck - { - public: - - //**Constructors***************************************************************************** - PhysicalCheck(); - PhysicalCheck( const std::vector<std::string>& equations, const std::map< std::string, std::string >& unitParameterRelations, const std::vector<std::string>& constraints ); - PhysicalCheck( const Config::BlockHandle& configBlock ); - - //**Utility functions************************************************************************ - /*! \name Utility functions */ - //@{ - void addBlock ( const Config::BlockHandle& configBlock ); - void addEquations ( const std::vector<std::string>& equations ); - void addUnitParameterRelations ( const std::map< std::string, std::string >& unitParameterRelations ); - void addConstraints ( const std::vector<std::string>& constraints ); - void completeConfig ( const shared_ptr<Config>& config ); - //@} - //**************************************************************************************************************** - - //**Get functions**************************************************************************** - /*! 
\name Get functions */ - //@{ - bool isDefined ( const std::string& varName ); - double getVarValue( const std::string& varName ); - //@} - //**************************************************************************************************************** - - //**Output functions************************************************************************* - /*! \name Output functions */ - //@{ - friend std::ostream& operator<<( std::ostream& os, PhysicalCheck& pc ); - //@} - //**************************************************************************************************************** - private: - //**Private functions to setup data layout from the given input file************************* - /*! \name Private functions */ - //@{ - void addEquation ( const std::string& equation ); - std::string getParametrizationTerm ( const std::string& varName ); - void solve (); - bool checkConstraints (); - void completeBlock ( Config::Block& configBlock, const std::map<std::string,double>& symbolTable ); - std::string getVarUnit ( const std::string& varName ); - bool setVarUnit ( const std::string& varName, const std::string& unit, const int expo ); - void getVarMap ( std::map<std::string,double>& varMap ); - void writeUnitParameterRelations (); - //@} - //**************************************************************************************************************** - - EquationSystem es_; - std::map<std::string, std::map<std::string, int> > unitParameterRelations_; - std::vector<std::string> constraints_; - bool solved_; - }; - -} // namespace math -} // namespace walberla diff --git a/src/core/math/Uint.h b/src/core/math/Uint.h index c5fc28d833f3d78cd360cd4f10d214277898943f..d15bc2f3db83cb9b9845d2791420edeaa94f3ba0 100644 --- a/src/core/math/Uint.h +++ b/src/core/math/Uint.h @@ -168,7 +168,8 @@ template<> uint_t uintMSBPosition< uint64_t >( uint64_t value ); // -> Uint.cpp template<> inline uint_t uintMSBPosition< uint32_t >( uint32_t value ) { - uint32_t i, j; + uint32_t i; + 
uint32_t j; j = value >> 16; if( j != 0 ) { diff --git a/src/core/math/all.h b/src/core/math/all.h index 0d5c22ad7d8b0f11e5887e47712ee0e36dcc9fdb..a1c1c9e1bc38113246d5a0c11e2a41baad0e2a83 100644 --- a/src/core/math/all.h +++ b/src/core/math/all.h @@ -36,9 +36,6 @@ #include "Matrix3.h" #include "Parser.h" #include "ParserOMP.h" -#ifdef WALBERLA_BUILD_WITH_BOOST -#include "PhysicalCheck.h" -#endif #include "Plane.h" #include "Primes.h" #include "Random.h" @@ -48,5 +45,3 @@ #include "Utility.h" #include "Vector2.h" #include "Vector3.h" - -#include "equation_system/all.h" diff --git a/src/core/math/equation_system/Equation.cpp b/src/core/math/equation_system/Equation.cpp deleted file mode 100644 index 96717ca97ae89dca49dc9730448a264fd08fc969..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/Equation.cpp +++ /dev/null @@ -1,261 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file Equation.cpp -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#include "Equation.h" -#include "Operator.h" -#include "Variable.h" - -#include <algorithm> -#include <cmath> -#include <memory> - - -namespace walberla { -namespace math { - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // NODE - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - Node::Node( const double value ) : nodeType_(NT_CONSTANT), value_(value ), opType_(OP_NO ) {} - Node::Node( const VarPtr& var ) : nodeType_(NT_VARIABLE), value_(FP_NAN), opType_(OP_NO ), var_(var) {} - Node::Node( OpType& opType ) : nodeType_(NT_OPERATOR), value_(FP_NAN), opType_(opType) {} - - void Node::setVar( const VarPtr& var ){ var_ = var; } - - void Node::collectVariables( VarMap& varMap ){ - switch (nodeType_) - { - case NT_CONSTANT: - break; - case NT_VARIABLE: - if ( varMap.find( var_->getName() ) == varMap.end() ) - varMap[var_->getName()] = var_; - break; - case NT_OPERATOR: - left_->collectVariables ( varMap ); - right_->collectVariables( varMap ); - break; - default: - WALBERLA_ABORT( "No correct node type" ); - break; - } - } - - uint_t Node::countUnknownVariables(){ - switch (nodeType_) - { - case NT_CONSTANT: - return 0; - case NT_VARIABLE: - return var_->valid() ? 
0 : 1; - case NT_OPERATOR: - return left_->countUnknownVariables() + right_->countUnknownVariables(); - default: - WALBERLA_ABORT( "No correct node type" ); - return 0; // has no effect - break; - } - } - - double Node::compute(){ - switch (nodeType_) - { - case NT_CONSTANT: - return value_; - case NT_VARIABLE: - return var_->getValue(); - case NT_OPERATOR: - return opType_(left_->compute(), right_->compute()); - default: - WALBERLA_ABORT( "No correct node type" ); - return 0; // has no effect - break; - } - } - - bool Node::findUnknownVariable(){ - switch (nodeType_) - { - case NT_CONSTANT: - return false; - case NT_VARIABLE: - return !var_->valid(); - case NT_OPERATOR: - if(left_->findUnknownVariable()){ - nodeDir_ = ND_LEFT; - return true; - } - if(right_->findUnknownVariable()){ - nodeDir_ = ND_RIGHT; - return true; - } - return false; - default: - WALBERLA_ABORT( "No correct node type" ); - return false; // has no effect - break; - } - } - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // EQUATION - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - Equation::Equation( const NodePtr& root) : root_ (root) - { - root_->collectVariables( varMap_ ); - } - - bool Equation::evaluate(){ - if (!isEvaluatable()) - WALBERLA_ABORT( "Equation is not evaluatable" ); - - double left = root_->left_->compute(); - double right = root_->right_->compute(); - - if ( std::isnan(left) && std::isnan(right) ){ - //WALBERLA_LOG_WARNING( "WARNING: Both values are NAN -> return true" ); - return true; - } else if ( std::isinf(left) && std::isinf(right) ){ - //WALBERLA_LOG_WARNING( "WARNING: Both values are INF -> return true" ); - return true; - } - - const double border = std::max( - std::fabs(left/2e12 + right/2e12), - std::fabs(left/2e12 - right/2e12) ); - - return std::fabs( left - right ) < std::max( border, 
std::numeric_limits<double>::epsilon() ); - } - - VarPtr Equation::compute(){ - if (!isComputable()) - WALBERLA_ABORT( "Equation is not computable" ); - - sort(); - - root_->left_->var_->setValue( root_->right_->compute() ); - - return root_->left_->var_; - } - - - void Equation::sort(){ - if ( root_->right_->findUnknownVariable() ) - root_->flip(); - else - root_->left_->findUnknownVariable(); - - while( root_->left_->nodeType_ == NT_OPERATOR ){ - if ( root_->left_->opType_ == OP_PLUS ) - { - rotate( (root_->left_->nodeDir_ == ND_RIGHT), OP_MINUS, OP_MINUS ); - } - else if ( root_->left_->opType_ == OP_MINUS ) - { - rotate( false, OP_PLUS, OP_MINUS ); - } - else if ( root_->left_->opType_ == OP_MULT ) - { - rotate( (root_->left_->nodeDir_ == ND_RIGHT), OP_DIV, OP_DIV ); - } - else if ( root_->left_->opType_ == OP_DIV ) - { - rotate( false, OP_MULT, OP_DIV ); - } - else if ( root_->left_->opType_ == OP_PROD ) - { - //rotate( (root_->left_->nodeDir_ == ND_RIGHT), OP_ROOT, OP_LOG ); - rotate( (root_->left_->nodeDir_ == ND_RIGHT), OP_PROD, OP_LOG ); - } - else if ( root_->left_->opType_ == OP_LOG ) - { - //rotate( (root_->left_->nodeDir_ == ND_LEFT), OP_PROD, OP_ROOT ); - rotate( (root_->left_->nodeDir_ == ND_LEFT), OP_PROD, OP_PROD ); - } - //else if ( root_->left_->opType_ == OP_ROOT ) - //{ - // rotate( false, OP_PROD, OP_LOG ); - //} - else - WALBERLA_ABORT( "Unknown operator" ); - } - } - - void Equation::rotate(bool flip, OpType& leftOp, OpType& rightOp){ - NodePtr newNode; - if ( root_->left_->nodeDir_ == ND_LEFT ){ - newNode = std::make_shared<Node>( leftOp ); - if (flip){ - newNode->left_ = root_->left_->right_; - newNode->right_ = root_->right_; - } else { - newNode->right_ = root_->left_->right_; - newNode->left_ = root_->right_; - } - root_->left_ = root_->left_->left_; - } else { - newNode = std::make_shared<Node>( rightOp ); - if (flip){ - newNode->right_ = root_->left_->left_; - newNode->left_ = root_->right_; - } else { - newNode->left_ = 
root_->left_->left_; - newNode->right_ = root_->right_; - } - root_->left_ = root_->left_->right_; - } - root_->right_ = newNode; - } - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // OUTPUT - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - std::ostream& operator<<( std::ostream& os, const Node & node ){ - switch (node.nodeType_) - { - case NT_CONSTANT: - os << node.value_; - break; - case NT_VARIABLE: - os << node.var_->getName(); - break; - case NT_OPERATOR: - if (node.opType_ == OP_EQUAL) - os << *node.left_ << node.opType_ << *node.right_; - else if( node.opType_ == OP_LOG ) - os << "log(" << *node.left_ << ", " << *node.right_ << ")"; - else - os << "(" << *node.left_ << node.opType_ << *node.right_ << ")"; - break; - default: - WALBERLA_ABORT( "No correct node type" ); - break; - } - return os; - } - - std::ostream& operator<<( std::ostream& os, const Equation & eq ){ - return os << *eq.root_; - } - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/Equation.h b/src/core/math/equation_system/Equation.h deleted file mode 100644 index 3ccbb14f7ef2ca2842222878448e44c10e3e7dfd..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/Equation.h +++ /dev/null @@ -1,113 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. 
-// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file Equation.h -//! \ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "FwdEquation.h" -#include "FwdOperator.h" -#include "FwdVariable.h" -#include "core/Abort.h" - - -namespace walberla { -namespace math { - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // NODE - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class Node - { - friend class Equation; - - private: - const NodeType nodeType_; - const double value_; - NodeDir nodeDir_; - OpType& opType_; - - VarPtr var_; - NodePtr left_; - NodePtr right_; - - public: - Node( const double value ); - Node( const VarPtr& var ); - Node( OpType& opType ); - private: - Node& operator=( const Node& ){ return *this; } - - public: - friend std::ostream& operator<<( std::ostream& os, const Node & node ); - - private: - uint_t countUnknownVariables(); - bool findUnknownVariable(); - - void collectVariables( VarMap& varMap ); - - void flip(){ left_.swap(right_); } - - public: - double compute(); - - NodePtr& left () { return left_; } - NodePtr& right() { return right_; } - - void setVar( const VarPtr& var ); - void setOp ( OpType& opType ); - }; - // end class Node - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // EQUATION - 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class Equation - { - public: - NodePtr root_; - VarMap varMap_; - - public: - Equation( const NodePtr& root ); - - private: - uint_t countUnknownVariables(){ return uint_c( root_->countUnknownVariables() ); } - - public: - friend std::ostream& operator<<( std::ostream& os, const Equation & eq ); - - bool isComputable() { return countUnknownVariables() == 1; } - bool isEvaluatable() { return countUnknownVariables() == 0; } - - bool evaluate(); - VarPtr compute(); - - private: - void sort(); - void rotate(bool flip, OpType& leftOp, OpType& rightOp); - - public: - const VarMap& getVarMap() { return varMap_; } - }; - // end class Equation - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/EquationParser.cpp b/src/core/math/equation_system/EquationParser.cpp deleted file mode 100644 index d5bdb0aafa54228fcecc356c9d24f877a65f5d80..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/EquationParser.cpp +++ /dev/null @@ -1,347 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file EquationParser.cpp -//! 
\ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#include "waLBerlaDefinitions.h" -#ifdef WALBERLA_BUILD_WITH_BOOST - -#include "Equation.h" -#include "EquationParser.h" -#include "Operator.h" -#include "Variable.h" -#include "core/math/Constants.h" -#include "core/StringUtility.h" - -#include <memory> - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// PARSE UTIL -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -#define THROW(msg, str, index) {\ - std::stringstream ss;\ - ss << (msg) << " -> [" << (str) << "] at [" << (index) << "]";\ - throw std::runtime_error( ss.str() );\ -} - -namespace walberla { -namespace math { - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// PARSE NUMBER -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -bool EquationParser::checkNumber( const std::string& str, size_t& index ) const -{ - if(str[index] == '+' || str[index] == '-') - return isdigit(str[index+1]) != int(0); - return isdigit(str[index]) != int(0); -} - -NodePtr EquationParser::parseNumber( const std::string& str, size_t& index ) const -{ - size_t start = index; - double value; - - if(str[index] == '+' || str[index] == '-') - ++index; - - if( isdigit(str[index]) == int(0) ) - THROW( "No number found", str, index ); - - while( isdigit(str[++index]) ); - - // numbers are allowed to end with a '.' - if ( str[index] == '.' 
) - while( isdigit(str[++index]) != int(0) ); - - if ( str[index] == 'e' || str[index] == 'E' ){ - ++index; - size_t estart = index; - if( str[index] == '+' || str[index] == '-' ) - ++index; - if( isdigit(str[index]) == int(0) ) - THROW( "Number ends with 'e'", str, index ); - while( isdigit(str[++index]) != int(0) ); - - value = std::stod( str.substr(start, estart-start-1) ) * - pow(10.0, std::stoi( str.substr(estart, index-estart) ) ); - } else { - value = std::stod( str.substr(start, index-start) ); - } - - return std::make_shared<Node>( value ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// PARSE NT_VARIABLE -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -bool EquationParser::checkVariable( const std::string& str, size_t& index ) const -{ - if(str[index] == '+' || str[index] == '-') - return (isalpha(str[index+1]) != int(0)) || (str[index+1] == '\''); - return (isalpha(str[index]) != int(0)) || (str[index] == '\''); -} - -NodePtr EquationParser::parseVariable( const std::string& str, size_t& index ) const -{ - bool sign = false; - if(str[index] == '+' || str[index] == '-'){ - sign = str[index] == '-'; - ++index; - } - - // variables can start with a ' - bool marked = (str[index] == '\''); - if ( marked ) - ++index; - - if ( isalpha(str[index]) == int(0) ) - THROW( "Variable name has to start with a letter", str, index ); - - size_t start = index; - size_t len; - - for ( - len=1, ++index; - (isalpha(str[index]) != int(0)) || (isdigit(str[index]) != int(0)) || str[index] == '_'; - ++len, ++index ); - - if ( marked ) - { - if (str[index] == '\'' ){ - ++index; - } else { - THROW( "Variable declaration has to end with '", str, index ); - } - } - - std::string name = str.substr(start, len); - - VarPtr varPtr; - if ( es_.varMap_.find(name) != es_.varMap_.end() ){ - varPtr = es_.varMap_[name]; - } else { - 
varPtr = std::make_shared<Var>( name ); - es_.varMap_[name] = varPtr; - } - - NodePtr nodePtr; - if (sign){ - nodePtr.reset( new Node(OP_MULT) ); - nodePtr->left().reset( new Node( -1) ); - nodePtr->right().reset( new Node(varPtr) ); - } else { - nodePtr.reset( new Node(varPtr) ); - } - return nodePtr; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// PARSE FUNCTION -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -bool EquationParser::checkFunction( const std::string& str, size_t& index ) const -{ - return (str.substr(index, 4) == "exp(") || - (str.substr(index, 3) == "ln(") || - (str.substr(index, 5) == "sqrt("); -} - -NodePtr EquationParser::parseFunction( const std::string& str, size_t& index ) const -{ - OpFunction opFunc; - if ( str.substr(index, 4) == "exp(" ){ - opFunc = OP_FUNC_EXP; - index += 4; - } else if ( str.substr(index, 3) == "ln(" ){ - opFunc = OP_FUNC_LN; - index += 3; - } else if ( str.substr(index, 5) == "sqrt(" ){ - opFunc = OP_FUNC_SQRT; - index += 5; - } else { - THROW( "Found no function", str, index ); - } - NodePtr nodePtr = parseExpression(str, index); - if ( ! 
(str[index] == ')') ) - THROW( "Found no enclosing parenthesis", str, index ); - ++index; - - NodePtr funcPtr; - - switch(opFunc) - { - case OP_FUNC_EXP: - funcPtr = std::make_shared<Node>( OP_PROD ); - funcPtr->left() = std::make_shared<Node>( math::e ); - funcPtr->right() = nodePtr; - return funcPtr; - case OP_FUNC_LN: - funcPtr = std::make_shared<Node>( OP_LOG ); - funcPtr->right() = std::make_shared<Node>( math::e ); - funcPtr->left() = nodePtr; - return funcPtr; - case OP_FUNC_SQRT: - funcPtr = std::make_shared<Node>( OP_PROD ); - funcPtr->left() = nodePtr; - funcPtr->right() = std::make_shared<Node>( 0.5 ); - return funcPtr; - default: - WALBERLA_ABORT( "Function not yet defined" ); - break; - } - return funcPtr; // has no effect -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// PARSE EXPRESSION -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//********************************************************************************************************************** -/*! -* Parses a given term inside the current expression -* -* Goal: creating a binary tree node for the corresponding term -* -* \param str string representation of the term -* \param index index of the current term within the equation -*/ -//********************************************************************************************************************** -NodePtr EquationParser::parseTerm( const std::string& str, size_t& index ) const -{ - NodePtr nodePtr; - - // check for the type of the current term - if ( str[index] == '(' ){ - nodePtr = parseExpression(str, ++index); - if ( ! 
(str[index] == ')') ) - THROW( "Found no enclosing parenthesis", str, index ); - ++index; - } else if ( checkFunction(str, index) ) { - nodePtr = parseFunction(str, index); - } else if ( checkVariable(str, index) ) { - nodePtr = parseVariable(str, index); - } else if ( checkNumber(str, index) ) { - nodePtr = parseNumber(str, index); - } else { - THROW( "Found no parenthesis, variable or number", str, index ); - } - return nodePtr; -} - -//********************************************************************************************************************** -/*! -* Parses a given expression inside the current Equation -* -* Goal: modeling the current expression in binary tree format -* -* \param str string representation of the expression -* \param index index of the current term within the equation -*/ -//********************************************************************************************************************** -NodePtr EquationParser::parseExpression( const std::string& str, size_t& index ) const -{ - NodePtr leftPtr = parseTerm(str, index); - - // index has been shifted to next term by parseTerm function - size_t indexFstOp = index; - if ( str[index] == '=' || str.size() == index || str[index] == ')'){ - return leftPtr; - } else if ( isop(str[index]) ){ - ++index; - } else { - THROW( "Found no operator or equal", str, index ); - } - - NodePtr rightPtr; - bool run = true; - do { - rightPtr = parseTerm(str, index); - - size_t indexSndOp = index; - if ( str[index] == '=' || str.size() == index || str[index] == ')'){ - NodePtr nodePtr ( new Node (getOp(str[indexFstOp])) ); - nodePtr->left() = leftPtr; - nodePtr->right() = rightPtr; - return nodePtr; - } else if ( isop(str[index]) ){ - ++index; - } else { - THROW( "Found no operator or equal", str, index ); - } - - OpType& opFst = getOp(str[indexFstOp]); - OpType& opSnd = getOp(str[indexSndOp]); - - if (opFst >= opSnd){ - NodePtr nodePtr ( new Node (getOp(str[indexFstOp])) ); - nodePtr->left() = leftPtr; - 
nodePtr->right() = rightPtr; - leftPtr = nodePtr; - indexFstOp = indexSndOp; - } else { - break; - } - } while ( run ); - - index = indexFstOp+1; - rightPtr = parseExpression(str, index); - - NodePtr nodePtr ( new Node (getOp(str[indexFstOp])) ); - nodePtr->left() = leftPtr; - nodePtr->right() = rightPtr; - return nodePtr; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// PARSE EQUATION -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//********************************************************************************************************************** -/*! -* Parses a given Equation and constructs a binary tree as representation -* -* Goal: Equation is modeled in a binary tree format to be solved later on -* -* \param str string representation of the equation -* \param index index of the current term (is always zero here) -*/ -//********************************************************************************************************************** -EquationPtr EquationParser::parseEquation( const std::string& str, size_t& index ) -{ - // removing leading and trailing spaces of input string - std::string trimmedStr = string_trim_copy(str); - // removing spaces inside the trimmed string - trimmedStr.erase(std::remove(trimmedStr.begin(), trimmedStr.end(), ' '), trimmedStr.end()); - NodePtr leftPtr = parseExpression(trimmedStr, index); - if ( ! 
(trimmedStr[index] == '=') ) - THROW( "Found no equal sign in equation", str, index ); - ++index; - - NodePtr rightPtr = parseExpression(trimmedStr, index); - - NodePtr nodePtr ( new Node (OP_EQUAL) ); - nodePtr->left() = leftPtr; - nodePtr->right() = rightPtr; - - return std::make_shared<Equation>( nodePtr ); -} - -} // namespace math -} // namespace walberla - -#endif \ No newline at end of file diff --git a/src/core/math/equation_system/EquationParser.h b/src/core/math/equation_system/EquationParser.h deleted file mode 100644 index e32287be8926f996fbb60073bd43eadf8b5d2506..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/EquationParser.h +++ /dev/null @@ -1,82 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file EquationParser.h -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "EquationSystem.h" -#include "core/logging/Logging.h" - -#include <sstream> - -namespace walberla { -namespace math { - - class EquationParser - { - private: - EquationSystem& es_; - - public: - EquationParser( EquationSystem& es) : es_(es) { } - - private: - EquationParser& operator=( const EquationParser& ) { return *this; } - - private: - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // PARSE NUMBER - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - bool checkNumber( const std::string& str, size_t& index ) const; - NodePtr parseNumber( const std::string& str, size_t& index ) const; - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // PARSE NT_VARIABLE - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - bool checkVariable( const std::string& str, size_t& index ) const; - NodePtr parseVariable( const std::string& str, size_t& index ) const; - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // PARSE FUNCTION - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - enum OpFunction{ - OP_FUNC_EXP, - OP_FUNC_LN, - OP_FUNC_SQRT - }; - - bool checkFunction( const std::string& str, size_t& index ) const; - NodePtr parseFunction( const std::string& str, size_t& index ) const; - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // PARSE EXPRESSION - 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - NodePtr parseTerm ( const std::string& str, size_t& index ) const; - NodePtr parseExpression( const std::string& str, size_t& index ) const; - - public: - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // PARSE EQUATION - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - EquationPtr parseEquation( const std::string& str, size_t& index ); - }; - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/EquationSystem.cpp b/src/core/math/equation_system/EquationSystem.cpp deleted file mode 100644 index ef68ac88aff3e89d6af02890fdfb31c2ab405162..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/EquationSystem.cpp +++ /dev/null @@ -1,206 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file EquationSystem.cpp -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#include "waLBerlaDefinitions.h" -#ifdef WALBERLA_BUILD_WITH_BOOST - -#include "Equation.h" -#include "EquationSystem.h" -#include "Variable.h" -#include "core/Abort.h" -#include "core/debug/Debug.h" -#include "core/logging/Logging.h" - - -#ifdef _MSC_VER -#pragma warning ( push, 1 ) -#pragma warning ( disable: 4701 ) -#endif - -#include <boost/graph/adjacency_list_io.hpp> -#include <boost/graph/max_cardinality_matching.hpp> - -#ifdef _MSC_VER -#pragma warning ( pop ) -#endif - -namespace walberla { -namespace math { - -void EquationSystem::add(const std::string& key, const EquationPtr& eq) -{ - if ( eqMap_.find(key) != eqMap_.end() ) - WALBERLA_ABORT( "Equation already exists" ); - - eqMap_[key] = eq; - eqVertices_[key] = boost::add_vertex(eqGraph_); - - for(VarMapIt it = eq->getVarMap().begin(); it != eq->getVarMap().end(); ++it) - { - if ( varVertices_.find(it->first) == varVertices_.end() ) - { - varVertices_[it->first] = boost::add_vertex(eqGraph_); - } - boost::add_edge(eqVertices_[key], varVertices_[it->first], eqGraph_); - } -} - -void EquationSystem::clear( ) -{ - eqMap_.clear(); - varMap_.clear(); -} - -void EquationSystem::remove(const std::string& key) -{ - eqMap_.erase(key); -} - -void EquationSystem::match() -{ - std::cout << "\nEquation Nodes:\n"; - for(EqMapIt it = eqMap_.begin(); it != eqMap_.end(); ++it) - { - //std::cout << *it->second << "\n"; - WALBERLA_LOG_RESULT( *it->second ); - } - - std::cout << "\nVariable Nodes:\n"; - for(VarMapIt it = varMap_.begin(); it != varMap_.end(); ++it) - { - //std::cout << *it->second << "\n"; - WALBERLA_LOG_RESULT( *it->second ); - } - - //std::cout << "\nInput Graph:\n" << boost::write( eqGraph_ ); - WALBERLA_LOG_RESULT( "\nInput Graph:\n" << boost::write( eqGraph_ ) ); - - std::vector<EqGraph::vertex_descriptor> mate( 
boost::num_vertices(eqGraph_) ); - WALBERLA_ASSERT( boost::checked_edmonds_maximum_cardinality_matching(eqGraph_, &mate[0]) ); - - WALBERLA_LOG_RESULT( "Maximum matching:" ); - EqGraph::vertex_iterator vi; - EqGraph::vertex_iterator vi_end; - for(boost::tie(vi,vi_end) = vertices(eqGraph_); vi != vi_end; ++vi) - if (mate[*vi] != boost::graph_traits<EqGraph>::null_vertex() && *vi < mate[*vi]) - //std::cout << "{" << *vi << ", " << mate[*vi] << "}" << std::endl; - WALBERLA_LOG_RESULT( "{" << *vi << ", " << mate[*vi] << "}" ); -} - -bool EquationSystem::solve() -{ - bool change = true; - while(change){ - change = false; - EqMapIt it = eqMap_.begin(); - for ( ; it != eqMap_.end(); ++it ){ - EquationPtr eq = it->second; - if (eq->isComputable()){ - VarPtr varPtr = eq->compute(); - change = true; - } else if (eq->isEvaluatable()){ - //eq->evaluate(); - if (!eq->evaluate()) - //std::cout << "Equation is not evaluatable! " << *eq << " -> " << eq->root_->left()->compute() << "!=" << eq->root_->right()->compute() << std::endl; - WALBERLA_ABORT( "Equation is not evaluatable! " << *eq << " -> " << eq->root_->left()->compute() << "!=" << eq->root_->right()->compute() ); - /*} else { - //std::cout << "Equation '" << *eq << "' is neither computable nor evaluatable!" << std::endl; - WALBERLA_LOG_RESULT( "Equation '" << *eq << "' is neither computable nor evaluatable!" );*/ - } - } - } - - /*for ( VarMapIt it = varMap_.begin(); it != varMap_.end(); ++it ){ - //std::cout << *it->second << std::endl; - WALBERLA_LOG_RESULT( *it->second ); - }*/ - bool evaluatable = true; - EqMapIt it = eqMap_.begin(); - for ( ; it != eqMap_.end(); ++it ){ - EquationPtr eq = it->second; - if ( !eq->isEvaluatable() || !eq->evaluate()){ - evaluatable = false; - //std::cout << "Equation is not evaluatable! " << *eq << " -> " << eq->root_->left()->compute() << "!=" << eq->root_->right()->compute() << std::endl; - //WALBERLA_LOG_RESULT( "Equation is not evaluatable! 
" << *eq << " -> " << eq->root_->left()->compute() << "!=" << eq->root_->right()->compute() ); - } - } - /*if (evaluatable) - //std::cout << "All Equations are evaluatable!" << std::endl; - WALBERLA_LOG_RESULT( "All Equations are evaluatable!" );*/ - - return evaluatable; -} - -bool EquationSystem::isVarDefined( const std::string& var ) const -{ - return varMap_.find(var) != varMap_.end(); -} - -double EquationSystem::getVarValue( const std::string& var ) const -{ - return varMap_.find(var)->second->getValue(); -} - -void EquationSystem::getVarMap( std::map<std::string,double>& varMap ) const -{ - for( auto it = varMap_.begin(); it != varMap_.end(); ++it ) - { - varMap.insert( std::pair<std::string,double>(it->first,it->second->getValue()) ); - } -} - -size_t EquationSystem::getNumberOfEquations() const -{ - return eqMap_.size(); -} - -std::string EquationSystem::writeEquations() const -{ - std::stringstream ss; - ss << "Equations to solve:" << std::endl; - for( auto it = eqMap_.begin(); it != eqMap_.end(); ++it ) - { - ss << *it->second << std::endl; - } - return ss.str(); -} - -std::string EquationSystem::writeVariables() const -{ - std::stringstream ss; - ss << "Solution for each variable:" << std::endl; - for( auto it = varMap_.begin(); it != varMap_.end(); ++it ) - { - ss << *it->second << std::endl; - } - return ss.str(); -} - -std::ostream& operator<<( std::ostream& os, EquationSystem& es ) -{ - os << es.writeEquations() << es.writeVariables(); - return os; -} - -} // namespace math -} // namespace walberla - -#endif diff --git a/src/core/math/equation_system/EquationSystem.h b/src/core/math/equation_system/EquationSystem.h deleted file mode 100644 index 82d17a0f79a552aa4d2b1cf13c575484909baf1e..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/EquationSystem.h +++ /dev/null @@ -1,127 +0,0 @@ -//====================================================================================================================== -// -// This file 
is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file EquationSystem.h -//! \ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "FwdEquation.h" -#include "FwdVariable.h" - -#include <map> -#include <string> -#include <vector> - - -#ifdef _MSC_VER -#pragma warning ( push, 1 ) -#endif -#include <boost/graph/adjacency_list.hpp> -#ifdef _MSC_VER -#pragma warning ( pop ) -#endif - - -namespace walberla { -namespace math { - - //=================================================================================================================== - // - // CLASS DEFINITION - // - //=================================================================================================================== - - //******************************************************************************************************************* - /*!\brief Wrapper class to handle and solve an equation system, e.g. given by the equations - * in an input file - * \author Matthias Markl - * - * This class handles and solves the equations given in an input file in terms of a triangular - * equation system. In order to do so, it employs boost-graphs to organize the equations in data structures. 
- * Furthermore, equations and the variables that are solved for are hold in map-structures and can be - * operated on from outside. - * - * The equations need to be given in the following form (e.g.): - * - * "'c' = 'dx_L' / 'dt_L'" - */ - class EquationSystem - { - private: - // forward declaration of EquationParser class - friend class EquationParser; - - using EqGraph = boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS>; - - using EqVertexMap = std::map<std::string, EqGraph::vertex_descriptor>; - using VarVertexMap = std::map<std::string, EqGraph::vertex_descriptor>; - - using EqVertexMapIt = std::map<std::string, EqGraph::vertex_descriptor>::const_iterator; - using VarVertexMapIt = std::map<std::string, EqGraph::vertex_descriptor>::const_iterator; - - using EqMap = std::map<std::string, EquationPtr>; - using EqMapIt = std::map<std::string, EquationPtr>::const_iterator; - - EqMap eqMap_; - EqGraph eqGraph_; - EqVertexMap eqVertices_; - - VarMap varMap_; - VarVertexMap varVertices_; - public: - - //**Get functions**************************************************************************** - /*! \name Get functions */ - //@{ - const EquationPtr& get (const std::string& key) { return eqMap_[key]; } - bool isVarDefined ( const std::string& var ) const; - double getVarValue ( const std::string& var ) const; - void getVarMap ( std::map<std::string,double>& varMap ) const; - size_t getNumberOfEquations () const; - //@} - //**************************************************************************************************************** - - //**Output functions************************************************************************* - /*! 
\name Output functions */ - //@{ - std::string writeEquations() const; - std::string writeVariables() const; - friend std::ostream& operator<<( std::ostream& os, EquationSystem& es ); - //@} - //**************************************************************************************************************** - - //**Utility functions************************************************************************ - /*! \name Utility functions */ - //@{ - void add(const std::string& key, const EquationPtr& eq); - void remove(const std::string& key); - bool solve(); - void match(); - void clear(); - //void push(); - //void pop(); - //@} - //**************************************************************************************************************** - - }; - - using EquationSystemPtr = shared_ptr<EquationSystem>; - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/FwdEquation.h b/src/core/math/equation_system/FwdEquation.h deleted file mode 100644 index 21fbb523cd241c52d51f60fb6fd3f13c0ca136cf..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/FwdEquation.h +++ /dev/null @@ -1,53 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! 
\file FwdEquation.h -//! \ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include <memory> - - -namespace walberla { -namespace math { - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // NODE - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - enum NodeType { - NT_OPERATOR, - NT_CONSTANT, - NT_VARIABLE - }; - - enum NodeDir { - ND_LEFT, - ND_RIGHT - }; - - class Node; - using NodePtr = std::shared_ptr<Node>; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // EQUATION - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - class Equation; - using EquationPtr = std::shared_ptr<Equation>; - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/FwdOperator.h b/src/core/math/equation_system/FwdOperator.h deleted file mode 100644 index 062f371cc3eba90cab5aacb9ccdd92e18b290132..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/FwdOperator.h +++ /dev/null @@ -1,55 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file FwdOperator.h -//! \ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include <map> - - -namespace walberla { -namespace math { - class OpType; - class OpNo; - class OpPlus; - class OpMinus; - class OpMult; - class OpDiv; - class OpProd; - class OpRoot; - class OpLog; - - // no valid operators - extern OpNo OP_NO; - extern OpNo OP_EQUAL; - - // operators - extern OpPlus OP_PLUS; - extern OpMinus OP_MINUS; - extern OpMult OP_MULT; - extern OpDiv OP_DIV; - extern OpProd OP_PROD; - - // functions - extern OpLog OP_LOG; - extern OpRoot OP_ROOT; - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/FwdVariable.h b/src/core/math/equation_system/FwdVariable.h deleted file mode 100644 index bb9e8edf550f014eb01aa904eee6aa1c697c9727..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/FwdVariable.h +++ /dev/null @@ -1,42 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. 
-// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file FwdVariable.h -//! \ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include <memory> -#include <map> -#include <string> - - -namespace walberla { -namespace math { - - extern double NAN_VAL; - - class Var; - - using VarPtr = std::shared_ptr<Var>; - - using VarMap = std::map<std::string, VarPtr>; - using VarMapIt = std::map<std::string, VarPtr>::const_iterator; - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/Operator.cpp b/src/core/math/equation_system/Operator.cpp deleted file mode 100644 index 86c062fc71509325f6520894e98ddd87894bdc44..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/Operator.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file Operator.cpp -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#include "Operator.h" - - -namespace walberla { -namespace math { - - // no valid operators - OpNo OP_NO ( 'n', "no op", 0u ); - OpNo OP_EQUAL( '=', "equal", 0u ); - - // operators - OpPlus OP_PLUS ( '+', "plus", 10u ); - OpMinus OP_MINUS( '-', "minus", 10u ); - OpMult OP_MULT ( '*', "mult", 30u ); - OpDiv OP_DIV ( '/', "div", 30u ); - OpProd OP_PROD ( '^', "prod", 40u ); - - // functions - OpLog OP_LOG ( '$', "log", 50u ); - //OpRoot OP_ROOT ( '%', "root", 50u ); - - - int isop( const char c ) - { - return - ( OP_PLUS == c || - OP_MINUS == c || - OP_MULT == c || - OP_DIV == c || - OP_PROD == c ) ? c : 0; - } - - OpType& getOp ( const char c ) - { - if (OP_PLUS == c) return OP_PLUS; - if (OP_MINUS == c) return OP_MINUS; - if (OP_MULT == c) return OP_MULT; - if (OP_DIV == c) return OP_DIV; - if (OP_PROD == c) return OP_PROD; - WALBERLA_ABORT( "Found no operator" ); - return OP_NO; // has no effect - } - - std::ostream& operator<<( std::ostream& os, const OpType & type ){ - if( type == '$' ) - return os << type.name_; - return os << type.sign_; - } - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/Operator.h b/src/core/math/equation_system/Operator.h deleted file mode 100644 index e9a6a75e1ed50196b285212e1b5bed3ddfec4649..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/Operator.h +++ /dev/null @@ -1,129 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. 
-// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file Operator.h -//! \ingroup core -//! \author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "FwdOperator.h" -#include "core/Abort.h" - -#include <cmath> -#include <string> - - -namespace walberla { -namespace math { - - class OpType - { - private: - const char sign_; - const std::string name_; - const unsigned int strength_; - - public: - OpType( const char& sign, const std::string& n, const unsigned int strength ) : - sign_(sign), name_(n), strength_(strength) {} - - virtual ~OpType() = default; - - private: - OpType& operator=( const OpType& ){ return *this; } - - public: - bool operator==( const OpType & type ) const { return sign_ == type.sign_; } - bool operator==( const char & c ) const { return sign_ == c; } - - bool operator<( const OpType & type ) const { return strength_ < type.strength_; } - bool operator>( const OpType & type ) const { return strength_ > type.strength_; } - bool operator<=( const OpType & type ) const { return strength_ <= type.strength_; } - bool operator>=( const OpType & type ) const { return strength_ >= type.strength_; } - - virtual double operator() ( const double&, const double& ) = 0; - - friend std::ostream& operator<<( std::ostream& os, const OpType & type ); - - public: - const std::string & getName() const { return name_; } - }; - - class OpNo : public OpType{ - public: - OpNo( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, 
name, strength ) {} - double operator() ( const double &, const double & ) override { WALBERLA_ABORT( "NO OPERATION" ); return 0; } - }; - - class OpPlus : public OpType{ - public: - OpPlus( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {}; - double operator() ( const double & a, const double & b ) override { return a + b; } - }; - - class OpMinus : public OpType{ - public: - OpMinus( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {} - double operator() ( const double & a, const double & b ) override { return a - b; } - }; - - class OpMult : public OpType{ - public: - OpMult( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {} - double operator() ( const double & a, const double & b ) override { return a * b; } - }; - - class OpDiv : public OpType{ - public: - OpDiv( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {} - double operator() ( const double & a, const double & b ) override { return a / b; } - }; - - class OpProd : public OpType{ - public: - OpProd( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {} - double operator() ( const double & a, const double & b ) override { return pow( a, b ); } - }; - - class OpRoot : public OpType{ - public: - OpRoot( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {} - double operator() ( const double & a, const double & b ) override { return pow( a, 1/b ); } - }; - - class OpLog : public OpType{ - public: - OpLog( const char& sign, const std::string& name, const unsigned int strength ) : - OpType( sign, name, strength ) {} - double operator() ( const double & a, const double & b ) override { return log10(a) / log10(b); } - }; - - - int isop( const char c ); - - OpType& 
getOp ( const char c ); - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/Variable.cpp b/src/core/math/equation_system/Variable.cpp deleted file mode 100644 index fcfd72f9a76751a20f57ef6a896a08e00228e85d..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/Variable.cpp +++ /dev/null @@ -1,51 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file Variable.cpp -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#include "Variable.h" - -#include <cmath> -#include <sstream> - - -namespace walberla { -namespace math { - - Var::Var ( const std::string& name ) : - name_ (name), - valid_ (false), - value_ (FP_NAN) - {} - - void Var::setValue( const double value ){ - value_ = value; - valid_ = !std::isnan( value ); - } - - bool Var::operator==( const Var& var) const { - return name_ == var.name_; - } - - std::ostream& operator<<( std::ostream& os, const Var & var ){ - return os << var.name_ << " = " << var.value_; - } - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/Variable.h b/src/core/math/equation_system/Variable.h deleted file mode 100644 index f80f19d5f9723e48d83c7e23d3c00df141d116de..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/Variable.h +++ /dev/null @@ -1,60 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file Variable.h -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "FwdVariable.h" - - -namespace walberla { -namespace math { - - class Var - { - private: - const std::string name_; - - bool valid_; - double value_; - - public: - Var ( const std::string& name ); - private: - Var& operator=( const Var& ){ return *this; } - - public: - bool operator==( const Var& var) const; - - public: - bool valid() const { return valid_; } - double getValue() const { return value_; } - const std::string& getName() const { return name_; } - - public: - void setValue( const double value ); - - public: - friend std::ostream& operator<<( std::ostream& os, const Var & var ); - }; - // end class Var - -} // namespace math -} // namespace walberla diff --git a/src/core/math/equation_system/all.h b/src/core/math/equation_system/all.h deleted file mode 100644 index 5c3cc61ed82ebd39c635b37d4f92bb51e5f19f3c..0000000000000000000000000000000000000000 --- a/src/core/math/equation_system/all.h +++ /dev/null @@ -1,31 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file all.h -//! 
\ingroup core -//! \author Florian Schornbaum <florian.schornbaum@fau.de> -//! \brief Collective header file for module core -// -//====================================================================================================================== - -#pragma once - -#include "Equation.h" -#ifdef WALBERLA_BUILD_WITH_BOOST -#include "EquationParser.h" -#include "EquationSystem.h" -#endif -#include "Operator.h" -#include "Variable.h" \ No newline at end of file diff --git a/src/core/mpi/BufferSystem.h b/src/core/mpi/BufferSystem.h index 6a531fa907e6f95ceb09d29b6ce31ecb9cbecf2e..04161810a408aab7bc0baf6ddb6159714333494a 100644 --- a/src/core/mpi/BufferSystem.h +++ b/src/core/mpi/BufferSystem.h @@ -151,7 +151,7 @@ public: void sendAll(); void send( MPIRank rank ); - iterator begin() { WALBERLA_ASSERT( communicationRunning_); return iterator( *this, true ); } + iterator begin() { WALBERLA_ASSERT( communicationRunning_) return iterator( *this, true ); } iterator end() { return iterator( *this, false); } //@} //******************************************************************************************************************* @@ -190,7 +190,7 @@ public: //@{ bool isSizeCommunicatedInNextStep() const { return (currentComm_ == &unknownSizeComm_); } bool isCommunicationRunning() const { return communicationRunning_; } - bool isReceiverInformationSet() const { return currentComm_ != NULL; } + bool isReceiverInformationSet() const { return currentComm_ != nullptr; } //@} //******************************************************************************************************************* diff --git a/src/core/mpi/BufferSystem.impl.h b/src/core/mpi/BufferSystem.impl.h index 4cbd884ba791bb4d07e3591dc9a6a0101ad657ff..183d29bd86b7412090916e0b2be92c3dc7a4c352 100644 --- a/src/core/mpi/BufferSystem.impl.h +++ b/src/core/mpi/BufferSystem.impl.h @@ -231,14 +231,14 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::set<MPIRank> & ran template< typename Rb, typename Sb> 
void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::map<MPIRank,MPISize> & ranksToRecvFrom ) { - WALBERLA_ASSERT( ! communicationRunning_ ); + WALBERLA_ASSERT( ! communicationRunning_ ) recvInfos_.clear(); for ( auto it = ranksToRecvFrom.begin(); it != ranksToRecvFrom.end(); ++it ) { const MPIRank sender = it->first; const MPISize senderSize = it->second; - WALBERLA_ASSERT_GREATER( senderSize, 0 ); + WALBERLA_ASSERT_GREATER( senderSize, 0 ) recvInfos_[ sender ].size = senderSize; } diff --git a/src/core/mpi/BufferSystemHelper.h b/src/core/mpi/BufferSystemHelper.h index 5603db56005c53eff6c7a5c5f2cbd369969b0c90..c505dfa0bd74f939500c1962458b4c1e2c9354fe 100644 --- a/src/core/mpi/BufferSystemHelper.h +++ b/src/core/mpi/BufferSystemHelper.h @@ -108,7 +108,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; KnownSizeCommunication( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~KnownSizeCommunication() override = default; @@ -121,8 +121,8 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool sending_; - bool receiving_; + bool sending_{false}; + bool receiving_{false}; std::vector<MPI_Request> sendRequests_; std::vector<MPI_Request> recvRequests_; @@ -136,7 +136,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; UnknownSizeCommunication( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~UnknownSizeCommunication() override = default; @@ -149,8 +149,8 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, 
ReceiveInfo> & recvInfos ) override; private: - bool sending_; - bool receiving_; + bool sending_{false}; + bool receiving_{false}; std::vector<MPI_Request> sendRequests_; std::list<MPISize> outgoingBufferForSizes_; @@ -168,7 +168,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; UnknownSizeCommunicationIProbe( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~UnknownSizeCommunicationIProbe() override = default; @@ -181,8 +181,8 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool sending_; - bool receiving_; + bool sending_{false}; + bool receiving_{false}; int pendingReceives_; std::vector<MPI_Request> sendRequests_; @@ -196,7 +196,7 @@ namespace internal { using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo; NoMPICommunication( const MPI_Comm & communicator, int tag = 0 ) - : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), received_( false ) {} + : AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ){} ~NoMPICommunication() override = default; @@ -209,7 +209,7 @@ namespace internal { MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos ) override; private: - bool received_; + bool received_{ false }; RecvBuffer_T tmpBuffer_; }; diff --git a/src/core/mpi/Datatype.h b/src/core/mpi/Datatype.h index 80b7931ef5311140620a41a777280dd5ae531c80..f717cb6d94c661aec320a864c972dbba4a49d2ae 100644 --- a/src/core/mpi/Datatype.h +++ b/src/core/mpi/Datatype.h @@ -40,23 +40,17 @@ namespace mpi { Datatype( MPI_Datatype datatype) : mpiDatatype_( datatype ) { -#ifdef WALBERLA_BUILD_WITH_MPI - MPI_Type_commit( &mpiDatatype_ ); -#endif + WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ 
); } } void init( MPI_Datatype datatype ) { mpiDatatype_ = datatype; -#ifdef WALBERLA_BUILD_WITH_MPI - MPI_Type_commit( &mpiDatatype_ ); -#endif + WALBERLA_MPI_SECTION() { MPI_Type_commit( &mpiDatatype_ ); } } ~Datatype() { -#ifdef WALBERLA_BUILD_WITH_MPI - MPI_Type_free( & mpiDatatype_ ); -#endif + WALBERLA_MPI_SECTION() { MPI_Type_free( & mpiDatatype_ ); } } operator MPI_Datatype() const { diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h index cd250cb97bdf438e48e79c4010d75942e16ba010..6b406c631072d43fc8d95b8a9c7f25e6b0472be6 100644 --- a/src/core/mpi/MPIWrapper.h +++ b/src/core/mpi/MPIWrapper.h @@ -120,6 +120,8 @@ struct MPI_Status const int MPI_COMM_NULL = 0; const int MPI_COMM_WORLD = 1; +const int MPI_COMM_TYPE_SHARED = 0; + const int MPI_SUCCESS = 1; @@ -202,11 +204,14 @@ inline int MPI_Comm_size( MPI_Comm, int* ) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Comm_rank( MPI_Comm, int* ) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Comm_get_name( MPI_Comm, char*, int* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_group ( MPI_Comm, MPI_Group* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_create( MPI_Comm, MPI_Group, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_free ( MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_dup ( MPI_Comm, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } -inline int MPI_Comm_split ( MPI_Comm, int, int, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Info_create ( MPI_Info * ) { WALBERLA_MPI_FUNCTION_ERROR } + +inline int MPI_Comm_group ( MPI_Comm, MPI_Group* ) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_create( MPI_Comm, MPI_Group, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_free ( MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_dup ( MPI_Comm, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_split ( MPI_Comm, int, int, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } +inline int MPI_Comm_split_type 
( MPI_Comm, int, int, MPI_Info, MPI_Comm *) { WALBERLA_MPI_FUNCTION_ERROR } inline int MPI_Cart_create( MPI_Comm, int, int*, int*, int, MPI_Comm* ) { WALBERLA_MPI_FUNCTION_ERROR } diff --git a/src/core/timing/CMakeLists.txt b/src/core/timing/CMakeLists.txt index b949b2eeb07612321b6297a53c92fa39c33d0eab..1de08d9623e95fc186687d355cf07e1336a70548 100644 --- a/src/core/timing/CMakeLists.txt +++ b/src/core/timing/CMakeLists.txt @@ -16,4 +16,5 @@ target_sources( core TimingTree.cpp TimingTree.h WcPolicy.h + DeviceSynchronizePolicy.h ) diff --git a/src/core/timing/DeviceSynchronizePolicy.h b/src/core/timing/DeviceSynchronizePolicy.h new file mode 100644 index 0000000000000000000000000000000000000000..7c494e48d22b1fb195d52b90334b9c0bed0c2f65 --- /dev/null +++ b/src/core/timing/DeviceSynchronizePolicy.h @@ -0,0 +1,84 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file DeviceSynchronizePolicy.h +//! \ingroup core +//! \author Richard Angersbach +//! 
\brief Gpu Timing Policy +// +//====================================================================================================================== + +#pragma once + +#include "gpu/DeviceWrapper.h" + +#include "Time.h" + +namespace walberla +{ +namespace timing +{ + +//====================================================================================================================== +// +// CLASS DEFINITION +// +//====================================================================================================================== + +//********************************************************************************************************************** +/*!\brief Timing policy for the measurement of the GPU time. +// \ingroup timing +// +// The DeviceSynchronizePolicy class represents the timing policy for GPU time measurements that can be used +// in combination with the Timer class template. This combination is realized with the DeviceSynchronizePolicy +// type definition. +// This class uses device synchronization internally and is therefore not suited for CUDA +// applications with overlapping kernels. 
+*/ +struct DeviceSynchronizePolicy +{ + public: + //**Timing functions**************************************************************************** + /*!\name Timing functions */ + //@{ + static inline double getTimestamp(); + //@} + //******************************************************************************************************************* +}; +//********************************************************************************************************************** + +//====================================================================================================================== +// +// TIMING FUNCTIONS +// +//====================================================================================================================== + +//********************************************************************************************************************** +/*!\brief Returns a timestamp of the current GPU time in seconds. Uses wall clock time and device synchronization +internally. +// +// \return GPU timestamp in seconds. +*/ +inline double DeviceSynchronizePolicy::getTimestamp() +{ + // synchronize device before getting timestamp + WALBERLA_DEVICE_SECTION() { gpuDeviceSynchronize(); } + + return getWcTime(); +} +//********************************************************************************************************************** + +} // namespace timing +} // namespace walberla diff --git a/src/core/timing/RemainingTimeLogger.h b/src/core/timing/RemainingTimeLogger.h index c20b715cdf21ddd612e7b29a43bb12fc4a3b96bf..a669f5a99852a3799ba3be802dc3d4755631d463 100644 --- a/src/core/timing/RemainingTimeLogger.h +++ b/src/core/timing/RemainingTimeLogger.h @@ -1,15 +1,15 @@ //====================================================================================================================== // -// This file is part of waLBerla. waLBerla is free software: you can +// This file is part of waLBerla. 
waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of +// License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. -// +// // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // @@ -23,12 +23,14 @@ #include "core/logging/Logging.h" #include "core/timing/Timer.h" +#include "core/DataTypes.h" #include <iomanip> - -namespace walberla { -namespace timing { +namespace walberla +{ +namespace timing +{ /*********************************************************************************************************************** * \brief Functor that can be added to a time loop in order to print an estimated remaining runtime. 
@@ -43,18 +45,18 @@ namespace timing { **********************************************************************************************************************/ class RemainingTimeLogger { -public: - - RemainingTimeLogger( const uint_t nrTimesteps, const double logIntervalInSec = 10, const int minOutputWidth = 8, const uint_t startTimestep = 0 ) - : timeSinceLastLog_( 0.0 ), logIntervalInSec_( logIntervalInSec ), - timestep_( startTimestep ), nrTimesteps_( nrTimesteps ), minOutputWidth_( minOutputWidth ), firstRun_( true ) - {} - - void operator() () + public: + RemainingTimeLogger(const uint_t nrTimesteps, const real_t logIntervalInSec = 10, const int minOutputWidth = 8, + const uint_t startTimestep = 0) + : logIntervalInSec_(logIntervalInSec), timestep_(startTimestep), nrTimesteps_(nrTimesteps), + minOutputWidth_(minOutputWidth) + { WALBERLA_UNUSED(minOutputWidth_); } + + void operator()() { WALBERLA_ROOT_SECTION() { - if( firstRun_ ) + if (firstRun_) { timer_.start(); firstRun_ = false; @@ -65,37 +67,38 @@ public: timer_.end(); ++timestep_; - timeSinceLastLog_ += timer_.last(); + timeSinceLastLog_ += real_c(timer_.last()); - if( timeSinceLastLog_ > logIntervalInSec_) + if (timeSinceLastLog_ > logIntervalInSec_) { timeSinceLastLog_ = 0.0; - uint_t timeStepsRemaining = nrTimesteps_ - timestep_; + uint_t const timeStepsRemaining = nrTimesteps_ - timestep_; - double remainingTime = timer_.average() * double_c( timeStepsRemaining ); - WALBERLA_LOG_INFO( "Estimated Remaining Time: " << std::setw( minOutputWidth_ ) << std::right - << timing::timeToString( real_c(remainingTime) ) ); + real_t const remainingTime = real_c(timer_.average()) * real_c(timeStepsRemaining); + WALBERLA_UNUSED(remainingTime); + WALBERLA_LOG_INFO("Estimated Remaining Time: " << std::setw(minOutputWidth_) << std::right + << timing::timeToString(remainingTime)); } timer_.start(); } } -private: - + private: WcTimer timer_; - double timeSinceLastLog_; - double logIntervalInSec_; - uint_t timestep_; - 
uint_t nrTimesteps_; - int minOutputWidth_; - bool firstRun_; + real_t timeSinceLastLog_{ 0.0 }; + real_t logIntervalInSec_; + uint_t timestep_; + uint_t nrTimesteps_; + int minOutputWidth_; + bool firstRun_{ true }; }; } // namespace timing } // namespace walberla -namespace walberla { - using timing::RemainingTimeLogger; -} +namespace walberla +{ +using timing::RemainingTimeLogger; +} \ No newline at end of file diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h index 89568b4f5ca8c509168c2d24ff99984fe42babee..9f7c3f97d1066ff7ffa322cb5d6550c9f5d5013b 100644 --- a/src/core/timing/Timer.h +++ b/src/core/timing/Timer.h @@ -25,14 +25,17 @@ #pragma once #include "CpuPolicy.h" +#include "DeviceSynchronizePolicy.h" #include "ReduceType.h" #include "WcPolicy.h" -#include "core/DataTypes.h" +#include "core/DataTypes.h" #include "core/mpi/RecvBuffer.h" #include "core/mpi/Reduce.h" #include "core/mpi/SendBuffer.h" +#include "gpu/DeviceWrapper.h" + #include <iomanip> #include <iostream> #include <limits> @@ -500,7 +503,7 @@ shared_ptr<Timer<TP> > getReduced( Timer<TP>& timer, ReduceType rt, int targetRa break; default: - WALBERLA_ABORT( "Unknown reduce type" ); + WALBERLA_ABORT( "Unknown reduce type" ) break; } @@ -590,6 +593,7 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, Timer<TP } //namespace timing using CpuTimer = timing::Timer<timing::CpuPolicy>; +using DeviceSynchronizeTimer = timing::Timer<timing::DeviceSynchronizePolicy>; using WcTimer = timing::Timer<timing::WcPolicy>; } // namespace walberla diff --git a/src/core/timing/TimingNode.cpp b/src/core/timing/TimingNode.cpp index 3e0cf4df5cd6c45ff7cc61ed8fb8f14651a9d0de..c75cd141483d6915a38487c16aeed6255c3da7bd 100644 --- a/src/core/timing/TimingNode.cpp +++ b/src/core/timing/TimingNode.cpp @@ -29,6 +29,7 @@ namespace timing { // Explicit instantiation template struct TimingNode<WcPolicy>; +template struct TimingNode<DeviceSynchronizePolicy>; template struct TimingNode<CpuPolicy>; 
} // namespace timing diff --git a/src/core/timing/TimingNode.h b/src/core/timing/TimingNode.h index a72c2f193378d94dd47f292f9758dd247758e553..0b6326e71096625d30ab69b2c850702473f7c04c 100644 --- a/src/core/timing/TimingNode.h +++ b/src/core/timing/TimingNode.h @@ -120,7 +120,7 @@ void TimingNode<TP>::swap(TimingNode<TP>& tt) template< typename TP > // Timing policy const Timer<TP>& findTimer( const TimingNode<TP>& tn, const std::string& name) { - auto pos = name.find_first_of("."); + auto pos = name.find_first_of('.'); if (pos != std::string::npos) { WALBERLA_ASSERT_UNEQUAL( tn.tree_.find(name.substr(0, pos)), tn.tree_.end(), "Could not find timer: " << name.substr(0, pos) ); @@ -139,7 +139,7 @@ const Timer<TP>& findTimer( const TimingNode<TP>& tn, const std::string& name) template< typename TP > // Timing policy bool timerExists( const TimingNode<TP>& tn, const std::string& name ) { - auto pos = name.find_first_of("."); + auto pos = name.find_first_of('.'); if (pos != std::string::npos) { if( tn.tree_.find(name.substr(0, pos)) != tn.tree_.end() ) @@ -494,6 +494,7 @@ void addRemainderNodes(timing::TimingNode<TP> &tn) { } using WcTimingNode = timing::TimingNode<timing::WcPolicy>; +using DeviceSynchronizeTimingNode = timing::TimingNode<timing::DeviceSynchronizePolicy>; using CpuTimingNode = timing::TimingNode<timing::CpuPolicy>; } diff --git a/src/core/timing/TimingPool.cpp b/src/core/timing/TimingPool.cpp index dff973201aa5e3976576c60038200cba4492fd6b..28cf668f2d08741bf2ec265726969a4974ab7480 100644 --- a/src/core/timing/TimingPool.cpp +++ b/src/core/timing/TimingPool.cpp @@ -116,7 +116,7 @@ shared_ptr<TimingPool<TP> > TimingPool<TP>::getReduced( ReduceType rt, int targe break; default: - WALBERLA_ABORT( "Unknown reduce type" ); + WALBERLA_ABORT( "Unknown reduce type" ) break; } @@ -474,6 +474,7 @@ void TimingPool<TP>::clear () // Explicit instantiation template class TimingPool<WcPolicy>; +template class TimingPool<DeviceSynchronizePolicy>; template class 
TimingPool<CpuPolicy>; diff --git a/src/core/timing/TimingPool.h b/src/core/timing/TimingPool.h index 5e41c14d783067b68f4fa76e3a789a0a6728c0bf..2d5ed09960a9b0ec1018abeab4295faf7aaf681e 100644 --- a/src/core/timing/TimingPool.h +++ b/src/core/timing/TimingPool.h @@ -249,5 +249,6 @@ namespace timing { namespace walberla { using WcTimingPool = timing::TimingPool<timing::WcPolicy>; + using DeviceSynchronizeTimingPool = timing::TimingPool<timing::DeviceSynchronizePolicy>; using CpuTimingPool = timing::TimingPool<timing::CpuPolicy>; } diff --git a/src/core/timing/TimingTree.cpp b/src/core/timing/TimingTree.cpp index fc891c31aad6acfa8778cdc13f9f6a1e1e7d5983..14cd472326dbf2a7182a81777188b2738458abef 100644 --- a/src/core/timing/TimingTree.cpp +++ b/src/core/timing/TimingTree.cpp @@ -29,6 +29,7 @@ namespace timing { // Explicit instantiation template class TimingTree<WcPolicy>; +template class TimingTree<DeviceSynchronizePolicy>; template class TimingTree<CpuPolicy>; } // namespace timing diff --git a/src/core/timing/TimingTree.h b/src/core/timing/TimingTree.h index 248dcdcced8edbf32743f3cf6ae507e66ae857de..5cf06167e00875f9ead6300ac7f1750d4f9f376a 100644 --- a/src/core/timing/TimingTree.h +++ b/src/core/timing/TimingTree.h @@ -144,7 +144,7 @@ void TimingTree<TP>::swap(TimingTree<TP>& tt) template< typename TP > // Timing policy void TimingTree<TP>::start(const std::string& name) { - if (name.find_first_of(".") != std::string::npos) + if (name.find_first_of('.') != std::string::npos) { WALBERLA_LOG_WARNING("'.' not allowed in timer name!"); } @@ -158,7 +158,7 @@ void TimingTree<TP>::start(const std::string& name) template< typename TP > // Timing policy void TimingTree<TP>::stop(const std::string& name) { - if (name.find_first_of(".") != std::string::npos) + if (name.find_first_of('.') != std::string::npos) { WALBERLA_LOG_WARNING("'.' 
not allowed in timer name!"); } @@ -259,5 +259,6 @@ TimingTree< TP > TimingTree< TP >::getCopyWithRemainder() const } using WcTimingTree = timing::TimingTree<timing::WcPolicy>; +using DeviceSynchronizeTimingTree = timing::TimingTree<timing::DeviceSynchronizePolicy>; using CpuTimingTree = timing::TimingTree<timing::CpuPolicy>; } diff --git a/src/core/uid/UIDGenerators.h b/src/core/uid/UIDGenerators.h index 4d4a333f9800ffdd88f32fa6e277f710ea21615d..265beeaac89babc29eade08b85a6012157f5377d 100644 --- a/src/core/uid/UIDGenerators.h +++ b/src/core/uid/UIDGenerators.h @@ -166,7 +166,8 @@ template< typename UINT > size_t logBase2( UINT value ); // 1000 -> 3, 0010 -> 1 template<> inline size_t logBase2< uint64_t >( uint64_t value ) { - uint64_t i, j; + uint64_t i; + uint64_t j; i = value >> 32; if( i != 0 ) { @@ -189,7 +190,8 @@ template<> inline size_t logBase2< uint64_t >( uint64_t value ) { template<> inline size_t logBase2< uint32_t >( uint32_t value ) { - uint32_t i, j; + uint32_t i; + uint32_t j; j = value >> 16; if( j != 0 ) { @@ -273,7 +275,7 @@ public: static uint_type firstUID() { return 1; } - static uint_type nextUID( const uint_type uid ) { WALBERLA_ASSERT( false ); return 1; } + static uint_type nextUID( const uint_type /*uid*/ ) { WALBERLA_ASSERT( false ); return 1; } static uint_type toIndex( const uint_type uid ) { WALBERLA_ASSERT_EQUAL( uid, 1 ); return 0; } diff --git a/src/cuda/CudaRAII.h b/src/cuda/CudaRAII.h deleted file mode 100644 index 5e1d7a3e717b3d390ea19bf9c3861f5d31c6f7f6..0000000000000000000000000000000000000000 --- a/src/cuda/CudaRAII.h +++ /dev/null @@ -1,115 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. 
waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file CudaRAII.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== -#pragma once - -#include "ErrorChecking.h" - -namespace walberla { -namespace cuda { - - - class StreamRAII - { - public: - ~StreamRAII() - { - if( stream_ != 0 ) { - WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ )); - } - } - - StreamRAII( StreamRAII &&other ) - { - stream_ = other.stream_; - other.stream_ = 0; - } - - StreamRAII( const StreamRAII & ) = delete; - - void operator=( const StreamRAII & ) = delete; - - operator cudaStream_t() const { return stream_; } - - - static StreamRAII defaultStream() - { - StreamRAII result; - result.stream_ = 0; - return result; - } - - static StreamRAII newPriorityStream( int priority ) - { - StreamRAII result; - WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority )); - return result; - } - - static StreamRAII newStream() - { - StreamRAII result; - WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_ )); - return result; - } - - private: - StreamRAII() {} - - cudaStream_t stream_; - }; - - - class EventRAII - { - public: - explicit EventRAII() - { - event = cudaEvent_t(); - WALBERLA_CUDA_CHECK( cudaEventCreate( &event 
)); - } - - ~EventRAII() - { - if( event != cudaEvent_t() ) - { - WALBERLA_CUDA_CHECK( cudaEventDestroy( event )); - } - } - - EventRAII( const EventRAII & ) = delete; - - void operator=( const EventRAII & ) = delete; - - EventRAII( EventRAII &&other ) - { - event = other.event; - other.event = cudaEvent_t(); - } - - operator cudaEvent_t() const { return event; } - - private: - cudaEvent_t event; - }; - - -} // namespace cuda -} // namespace walberla \ No newline at end of file diff --git a/src/cuda/ErrorChecking.h b/src/cuda/ErrorChecking.h deleted file mode 100644 index 82dc0b4a913936eb2c4b8f9d01ccc0c1e0df2528..0000000000000000000000000000000000000000 --- a/src/cuda/ErrorChecking.h +++ /dev/null @@ -1,53 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file ErrorChecking.h -//! \ingroup cuda -//! 
\author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "core/Abort.h" - -#include <sstream> -#include <cuda_runtime.h> - - -namespace walberla { -namespace cuda { - - -#define WALBERLA_CUDA_CHECK(ans) { ::walberla::cuda::checkForError((ans), __FILE__, __LINE__); } - - - -inline void checkForError( cudaError_t code, const std::string & callerPath, const int line ) -{ - if(code != cudaSuccess) - { - std::stringstream ss; - ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code ); - Abort::instance()->abort( ss.str(), callerPath, line ); - } -} - - - -} // namespace cuda -} // namespace walberla - - diff --git a/src/cuda/ExecutionTreeGPU.h b/src/cuda/ExecutionTreeGPU.h deleted file mode 100644 index 9c865378cda0ce1125883ad1970dbda11d286f61..0000000000000000000000000000000000000000 --- a/src/cuda/ExecutionTreeGPU.h +++ /dev/null @@ -1,203 +0,0 @@ -//============================================================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file ExecutionTreeGPU.h -//! \ingroup cuda -//! 
\author Martin Bauer <martin.bauer@fau.de> -// -//============================================================================================================================================================== - -#pragma once - -#include "executiontree/ExecutionTree.h" -#include "ParallelStreams.h" - -#include <cuda_runtime.h> - -#ifdef CUDART_VERSION -#if CUDART_VERSION <= 9020 -cudaError_t cudaLaunchHostFunc( cudaStream_t, void(CUDART_CB* )( void* userData ), void* ) { - static bool printedWarning = false; - if( ! printedWarning ) { - WALBERLA_LOG_WARNING_ON_ROOT("Timing of CUDA functions only implemented for CUDA versions >= 10.0" ); - printedWarning = true; - } - return cudaSuccess; -} -#endif -#endif - -namespace walberla { -namespace executiontree { - -// -------------------------------------- Forward Declarations ------------------------------------------------------------------------------------------------ - -using executiontree::IFunctionNode; -using executiontree::IFunctionNodePtr; -using executiontree::TimingTreePtr; - -class SequenceCUDA; -class IFunctionNodeCUDA; -template<typename FunctorClass> class FunctorCUDA; -using IFunctionNodeCUDAPtr = shared_ptr<IFunctionNodeCUDA>; - - -// -------------------------------------- Public Interface ------------------------------------------------------------------------------------------------ - -template<typename FunctorType> -IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr ); - - -shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, - const std::string &name, cudaStream_t defaultStream = 0, bool parallel = false, int priority = 0, - const TimingTreePtr &timingTree = nullptr ); - - -// -------------------------------------- Node Classes -------------------------------------------------------------------------------------------------------- - - -class IFunctionNodeCUDA : public 
IFunctionNode -{ -public: - virtual void operator()( cudaStream_t ) = 0; -}; - -template<typename FunctorClass> -void CUDART_CB functorCUDAStartTimer(void *data) -{ - auto functor = reinterpret_cast<FunctorClass *>( data ); - functor->timingTree_->start( functor->getName() ); -} - -template<typename FunctorClass> -void CUDART_CB functorCUDAStopTimer(void *data) -{ - auto functor = reinterpret_cast<FunctorClass *>( data ); - functor->timingTree_->stop( functor->getName() ); -} - -template<typename FunctorType> -class FunctorCUDA : public IFunctionNodeCUDA -{ -public: - FunctorCUDA( const FunctorType &functor, - const std::string &name, - const TimingTreePtr &timingTree ) - : functor_( functor ), name_( name ), timingTree_( timingTree ) {} - - void operator() (cudaStream_t stream) override - { - if ( timingTree_ ) - { - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer<FunctorCUDA<FunctorType> >, this ) ); - executiontree::internal::Caller<FunctorType>::call( functor_, stream ); - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer<FunctorCUDA<FunctorType> >, this ) ); - } - else - executiontree::internal::Caller<FunctorType>::call( functor_, stream ); - } - - const std::string getName() const override { return name_ != "" ? 
name_ : "FunctorCUDA"; }; - void operator() () override { (*this)( 0 ); } - -private: - friend void CUDART_CB functorCUDAStartTimer<FunctorCUDA<FunctorType> >(void *data); - friend void CUDART_CB functorCUDAStopTimer<FunctorCUDA<FunctorType> >(void *data); - - FunctorType functor_; - std::string name_; - shared_ptr< WcTimingTree > timingTree_; -}; - - -class SequenceCUDA : public IFunctionNodeCUDA -{ -public: - SequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, const std::string &name, cudaStream_t defaultStream, - bool parallel = false, int priority=0, - const TimingTreePtr &timingTree = nullptr) - : name_( name ), defaultStream_( defaultStream), timingTree_( timingTree ), parallelStreams_( priority ), parallel_( parallel ), priority_(priority) - { - for ( auto &e : initializerList ) - children_.push_back( e ); - } - - - void operator() (cudaStream_t stream) override - { - if ( timingTree_ ) { - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer< SequenceCUDA >, this )); - } - - if( parallel_ ) - { - auto parallelSection = parallelStreams_.parallelSection( stream ); - for ( auto &el : children_ ) - { - ( *el )( parallelSection.stream()); - parallelSection.next(); - } - } - else - for ( auto &el : children_ ) - (*el)( stream ); - - if ( timingTree_ ) { - WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer< SequenceCUDA >, this )); - } - } - - void operator() () override { (*this)( defaultStream_ ); } - void push_back( const IFunctionNodeCUDAPtr &fct ) { children_.push_back( fct ); } - void push_front( const IFunctionNodeCUDAPtr &fct ) { children_.push_front( fct ); } - const std::string getName() const override { return name_ != "" ? 
name_ : "ParallelSequenceCUDA"; }; - const std::deque< IFunctionNodePtr > getChildren() const override { - std::deque< IFunctionNodePtr > result; - for( auto & c : children_ ) - result.push_back( c ); - return result; - }; - -private: - friend void CUDART_CB functorCUDAStartTimer< SequenceCUDA >( void *data ); - friend void CUDART_CB functorCUDAStopTimer< SequenceCUDA >( void *data ); - - std::string name_; - cudaStream_t defaultStream_; - std::deque< IFunctionNodeCUDAPtr > children_; - shared_ptr< WcTimingTree > timingTree_; - cuda::ParallelStreams parallelStreams_; - bool parallel_; - int priority_; -}; - - -template<typename FunctorType> -IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree ) -{ - return make_shared<FunctorCUDA<FunctorType> >( t, name, timingTree ); -} - - -shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, - const std::string &name, cudaStream_t defaultStream, bool parallel, int priority, - const TimingTreePtr &timingTree ) -{ - return make_shared< SequenceCUDA >( initializerList, name, defaultStream, parallel, priority, timingTree ); -} - - -} // namespace executiontree -} // namespace walberla diff --git a/src/cuda/ExecutionTreeSweepGPU.h b/src/cuda/ExecutionTreeSweepGPU.h deleted file mode 100644 index 6f97277c4b75a5fe2dcdf0ec383ed8217699f1b2..0000000000000000000000000000000000000000 --- a/src/cuda/ExecutionTreeSweepGPU.h +++ /dev/null @@ -1,105 +0,0 @@ -//============================================================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. 
-// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file ExecutionTreeSweepGPU.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -// -//============================================================================================================================================================== - -#pragma once - -#include "domain_decomposition/IBlock.h" -#include "executiontree/ExecutionTree.h" -#include "ExecutionTreeGPU.h" - -namespace walberla { -namespace executiontree { - - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr ); - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name = "", - const TimingTreePtr &tt = nullptr ); - - -template<typename FunctorType> -class SweepCUDA : public IFunctionNodeCUDA -{ -public: - SweepCUDA( BlockStorage &bs, - const FunctorType &functor, - const std::string &name, - const TimingTreePtr &timingTree ) - : blockStorage_( bs ), - functor_( functor ), - name_( name ), - timingTree_( timingTree ) {} - - SweepCUDA( const shared_ptr <StructuredBlockStorage> &bs, - const FunctorType &functor, - const std::string &name, - const TimingTreePtr &timingTree ) - : blockStorage_( bs->getBlockStorage()), - functor_( functor ), - name_( name ), - timingTree_( timingTree ) {} - - void operator() () override { (*this)( 0 ); } - - void operator()( cudaStream_t stream ) override - { - if ( timingTree_ ) - { - for ( auto &block: 
blockStorage_ ) - { - timingTree_->start( name_ ); - executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream ); - timingTree_->stop( name_ ); - } - } - else - for ( auto &block: blockStorage_ ) - executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream ); - } - - const std::string getName() const override { return name_ != "" ? name_ : "Sweep"; }; - -private: - BlockStorage &blockStorage_; - - FunctorType functor_; - std::string name_; - TimingTreePtr timingTree_; -}; - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, FunctorType t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree ) -{ - return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree ); -} - -template<typename FunctorType> -IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name, - const TimingTreePtr &timingTree ) -{ - return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree ); -} - - -} // namespace executiontree -} // namespace walberla diff --git a/src/cuda/FieldCopy.h b/src/cuda/FieldCopy.h deleted file mode 100644 index 4f13fa999ffec19249183fecd2a4fe0939e2674e..0000000000000000000000000000000000000000 --- a/src/cuda/FieldCopy.h +++ /dev/null @@ -1,213 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. 
-// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file FieldCopy.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#pragma once - -#include "ErrorChecking.h" -#include "GPUField.h" - -#include "domain_decomposition/StructuredBlockStorage.h" -#include "field/Field.h" -#include "field/GhostLayerField.h" - -#include "core/Abort.h" -#include "core/logging/Logging.h" - -#include <cuda_runtime.h> - -namespace walberla { -namespace cuda { - - - template<typename DstType, typename SrcType> - void fieldCpy( const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID dstID, ConstBlockDataID srcID ) - { - for ( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) - { - DstType * dst = blockIt->getData<DstType>( dstID ); - const SrcType * src = blockIt->getData<SrcType>( srcID ); - fieldCpy( *dst, *src ); - } - } - - template<typename DstType, typename SrcType> - std::function<void()> fieldCpyFunctor( const shared_ptr< StructuredBlockStorage > & blocks, - BlockDataID dstID, ConstBlockDataID srcID ) - { - return std::bind( fieldCpy<DstType,SrcType>, blocks, dstID, srcID ); - } - - - - template<typename DstType, typename SrcType> - void fieldCpySweepFunction( BlockDataID dstID, ConstBlockDataID srcID, IBlock * block ) - { - DstType * dst = block->getData<DstType>( dstID ); - const SrcType * src = block->getData<SrcType>( srcID ); - fieldCpy( *dst, *src ); - } - - template<typename DstType, typename SrcType> - std::function<void(IBlock*)> fieldCpyFunctor( BlockDataID dstID, ConstBlockDataID srcID ) - { - return std::bind( fieldCpySweepFunction<DstType,SrcType>, dstID, srcID, std::placeholders::_1 ); - } - - - - - - template<typename T, uint_t fs> - void fieldCpy( cuda::GPUField<T> & dst, const 
field::Field<T,fs> & src ); - - - - template<typename T, uint_t fs> - void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src ); - - - - - //=================================================================================================================== - // - // Implementation - // - //=================================================================================================================== - - - - - template<typename T, uint_t fs> - void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src ) - { - cudaMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - - if ( dst.layout() != src.layout() ) { - WALBERLA_ABORT( "Cannot copy fields with different layout" ); - } - - bool canCopy = ( src.layout() == fzyx && - dst.fAllocSize() == src.fAllocSize() && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xSize() == src.xSize() ) - || - ( src.layout() == zyxf && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xAllocSize() == src.xAllocSize() && - dst.fSize() == src.fSize() ); - - if ( !canCopy ) { - WALBERLA_ABORT("Field have to have the same size "); - } - - if ( dst.layout() == fzyx ) - { - p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer - sizeof(T) * src.xAllocSize(), // pitch - src.xAllocSize(), // inner dimension size - src.yAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T); - p.extent.height = dst.yAllocSize(); - p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); - } - else - { - p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer - sizeof(T) * src.fAllocSize(), // pitch - src.fAllocSize(), // inner dimension size - src.xAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T); - p.extent.height = dst.xAllocSize(); - p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); - } - - 
p.dstPtr = dst.pitchedPtr(); - p.kind = cudaMemcpyHostToDevice; - WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) ); - } - - - - template<typename T, uint_t fs> - void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src ) - { - cudaMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - if ( dst.layout() != src.layout() ) { - WALBERLA_ABORT( "Cannot copy fields with different layout" ); - } - - bool canCopy = ( src.layout() == fzyx && - dst.fAllocSize() == src.fAllocSize() && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xSize() == src.xSize() ) - || - ( src.layout() == zyxf && - dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() && - dst.xAllocSize() == src.xAllocSize() && - dst.fSize() == src.fSize() ); - - if ( !canCopy ) { - WALBERLA_ABORT("Field have to have the same size "); - } - - if ( dst.layout() == fzyx ) - { - p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer - sizeof(T) * dst.xAllocSize(), // pitch - dst.xAllocSize(), // inner dimension size - dst.yAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T); - p.extent.height = dst.yAllocSize(); - p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); - } - else - { - p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer - sizeof(T) * dst.fAllocSize(), // pitch - dst.fAllocSize(), // inner dimension size - dst.xAllocSize() ); // next outer dimension size - - p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T); - p.extent.height = dst.xAllocSize(); - p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); - } - - p.srcPtr = src.pitchedPtr(); - p.kind = cudaMemcpyDeviceToHost; - WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) ); - - } - -} // namespace cuda -} // namespace walberla - - diff --git a/src/cuda/FieldIndexing.h b/src/cuda/FieldIndexing.h deleted file mode 100644 index 
229d3b36b752d86826699430bf6734f8de8a8c4c..0000000000000000000000000000000000000000 --- a/src/cuda/FieldIndexing.h +++ /dev/null @@ -1,95 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file FieldIndexing.h -//! \ingroup cuda -//! \author Martin Bauer <martin.bauer@fau.de> -//! \brief Indexing Scheme that executes all elements of inner coordinate within on thread block -// -//====================================================================================================================== - -#pragma once - -#include "FieldAccessor.h" - -#include "stencil/Directions.h" -#include <cuda_runtime.h> - -namespace walberla { namespace cell { class CellInterval; } } - -namespace walberla { -namespace cuda { - - // Forward Declarations - template< typename T> class GPUField; - - template<typename T> - class FieldIndexing - { - public: - - //** Kernel call ****************************************************************************************** - /*! 
\name Kernel call */ - //@{ - dim3 blockDim() const { return blockDim_; } - dim3 gridDim () const { return gridDim_; } - - const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; } - //@} - //**************************************************************************************************************** - - - - - //** Creation ********************************************************************************************* - /*! \name Creation */ - //@{ - static FieldIndexing<T> interval ( const GPUField<T> & f, - const cell::CellInterval & ci, - int fBegin=0, int fEnd=1 ); - - - static FieldIndexing<T> xyz ( const GPUField<T> & f ); - static FieldIndexing<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers ); - static FieldIndexing<T> ghostLayerOnlyXYZ ( const GPUField<T> & f, uint_t thickness, - stencil::Direction dir, bool fullSlice = false ); - static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness, - stencil::Direction dir, bool fullSlice = false ); - static FieldIndexing<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness, - stencil::Direction dir, bool fullSlice = false ); - - static FieldIndexing<T> allInner ( const GPUField<T> & f ); - static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f ); - static FieldIndexing<T> all ( const GPUField<T> & f, const cell::CellInterval & ci ); - //@} - //**************************************************************************************************************** - - protected: - FieldIndexing ( const GPUField<T> & field, - dim3 _blockDim, dim3 _gridDim, - const FieldAccessor<T> _gpuAccess ); - - const GPUField<T> & field_; - dim3 blockDim_; - dim3 gridDim_; - FieldAccessor<T> gpuAccess_; - }; - - -} // namespace cuda -} // namespace walberla - -#include "FieldIndexing.impl.h" - diff --git a/src/cuda/GPUCopy.cpp b/src/cuda/GPUCopy.cpp deleted file mode 100644 index 
834150fdcb921064392fb99d6690752b6eaaee3c..0000000000000000000000000000000000000000 --- a/src/cuda/GPUCopy.cpp +++ /dev/null @@ -1,396 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file GPUCopy.cpp -//! \ingroup cuda -//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> -//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> -//! \brief Copy routines of 4D intervals involving GPU buffers. 
-// -//====================================================================================================================== - -#include "core/debug/Debug.h" - -#include "GPUCopy.h" -#include "ErrorChecking.h" - -#include <cstring> - - -namespace walberla { -namespace cuda { - -void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { - WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ); - - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); - p.kind = cudaMemcpyDeviceToDevice; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == 
srcAllocSizeZ ) ) - { - copyFunctor( dstF, srcF, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyFunctor( dstF + f, srcF + f, uint_c(1) ); - } - } -} - - -void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); - p.kind = cudaMemcpyDeviceToDevice; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) - { - copyFunctor( dstZ, srcZ, Nz ); - } - else - { - for( uint_t z = 0; z < Nz; ++z ) - { - copyFunctor( dstZ + z, srcZ + z, 1 ); - } - } -} - - -void copyHostToDevFZYX( const 
cudaPitchedPtr& dst, unsigned char* src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); - p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny ); - - p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); - p.kind = cudaMemcpyHostToDevice; - - if (copyStream == 0) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
- WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if ( Nf == 1 || ( Nz == dstAllocSizeZ ) ) - { - copyFunctor( dstF, srcF, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyFunctor( dstF + f, srcF + f, uint_c(1) ); - } - } -} - -void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); - p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx ); - - p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); - p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); - - p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); - p.kind = cudaMemcpyHostToDevice; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
- WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) - { - copyFunctor( dstZ, srcZ, Nz ); - } - else - { - for( uint_t z = 0; z < Nz; ++z ) - { - copyFunctor( dstZ + z, srcZ + z, 1 ); - } - } -} - - -void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); - p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny ); - - p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); - p.kind = cudaMemcpyDeviceToHost; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
- WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) - { - copyFunctor( dstF, srcF, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyFunctor( dstF + f, srcF + f, 1 ); - } - } -} - - -void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, - std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, - std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, - uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, - std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ) -{ - const uint_t & Nx = std::get<0>(intervalSize), - & Ny = std::get<1>(intervalSize), - & Nz = std::get<2>(intervalSize), - & Nf = std::get<3>(intervalSize); - - const uint_t & srcX = std::get<0>(srcOffset), - & srcY = std::get<1>(srcOffset), - & srcZ = std::get<2>(srcOffset), - & srcF = std::get<3>(srcOffset); - - const uint_t & dstX = std::get<0>(dstOffset), - & dstY = std::get<1>(dstOffset), - & dstZ = std::get<2>(dstOffset), - & dstF = std::get<3>(dstOffset); - - auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { - cudaMemcpy3DParms p; - std::memset( &p, 0, sizeof(p) ); - - p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); - p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); - - p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); - p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx ); - - p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); - - p.kind = cudaMemcpyDeviceToHost; - - if ( copyStream == 0 ) - { - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); - } - else - { - // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
- WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); - } - }; - - - if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) - { - copyFunctor( dstZ, srcZ, Nz ); - } - else - { - for( uint_t z = 0; z < Nz; ++z ) - { - copyFunctor( dstZ + z, srcZ + z, 1 ); - } - } -} - -} // namespace cuda -} // namespace walberla diff --git a/src/cuda/NVTX.h b/src/cuda/NVTX.h deleted file mode 100644 index a8c1210b827b89d28f5d1491a84adaa9f020432e..0000000000000000000000000000000000000000 --- a/src/cuda/NVTX.h +++ /dev/null @@ -1,75 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file NVTX.h -//! \ingroup cuda -//! 
\author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - -#include "core/DataTypes.h" - -#include <string> - -#include <nvToolsExt.h> -#include <nvToolsExtCuda.h> -#include <nvToolsExtCudaRt.h> - -namespace walberla{ -namespace cuda { - -inline void nvtxMarker(const std::string& name, const uint32_t color=0xaaaaaa) -{ - nvtxEventAttributes_t eventAttrib; - memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = 0xFF000000 | color; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = name.c_str(); - nvtxMarkEx(&eventAttrib); -} - -inline void nameStream(const cudaStream_t & stream, const std::string & name) -{ - nvtxNameCudaStreamA(stream, name.c_str()); -} - -class NvtxRange -{ -public: - NvtxRange(const std::string & name, const uint32_t color=0xaaaaaa) - { - memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = 0xFF000000 | color; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = name.c_str(); - nvtxRangePushEx(&eventAttrib); - } - ~NvtxRange() - { - nvtxRangePop(); - } -private: - nvtxEventAttributes_t eventAttrib; -}; - - -} // namespace cuda -} // namespace walberla \ No newline at end of file diff --git a/src/cuda/communication/MemcpyPackInfo.h b/src/cuda/communication/MemcpyPackInfo.h deleted file mode 100644 index 20637b51a36385b21e49cd257a4a41bdaf1ea0a8..0000000000000000000000000000000000000000 --- a/src/cuda/communication/MemcpyPackInfo.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include "stencil/Directions.h" -#include "core/cell/CellInterval.h" -#include 
"cuda/GPUField.h" -#include "core/DataTypes.h" -#include "domain_decomposition/IBlock.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" - - -namespace walberla { -namespace cuda { -namespace communication { - -template<typename GPUFieldType> -class MemcpyPackInfo : public ::walberla::cuda::GeneratedGPUPackInfo -{ -public: - MemcpyPackInfo( BlockDataID pdfsID_ ) - : pdfsID(pdfsID_), numberOfGhostLayers_(0), communicateAllGhostLayers_(true) - {}; - virtual ~MemcpyPackInfo() = default; - - void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream) override; - void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream) override; - uint_t size(stencil::Direction dir, IBlock * block) override; - -private: - BlockDataID pdfsID; - uint_t numberOfGhostLayers_; - bool communicateAllGhostLayers_; - - uint_t numberOfGhostLayersToCommunicate( const GPUFieldType * const field ) const; -}; - -} // namespace communication -} // namespace cuda -} // namespace walberla - -#include "MemcpyPackInfo.impl.h" diff --git a/src/cuda/doc/cuda.dox b/src/cuda/doc/cuda.dox deleted file mode 100644 index 96652834d67ca2a6caa342855d77981f52d7f214..0000000000000000000000000000000000000000 --- a/src/cuda/doc/cuda.dox +++ /dev/null @@ -1,80 +0,0 @@ - -namespace walberla{ -/*! 
- -\page cudaPage Overview of waLBerla CUDA support - -\brief waLBerla CUDA concepts - - -\section cudaField Fields on GPU - - -\subsection cudaFieldOverview Creating GPU fields and copy them between host and device - - \code - // create a CPU field and a GPU field of same size and with same layout - GhostLayerField<double,4> h_f ( 16,20,30, 1, 42.0, field::fzyx ); - cuda::GPUField<double> d_f ( 16,20,30, 4, 1, field::fzyx ); - - cuda::fieldCpy( d_f, h_f ); // copy from host to device - some_kernel_wrapper( d_f ); // run some kernel - cuda::fieldCpy( h_f, d_f ); // copy field data back to host - \endcode - - Similarities and Differences of CPU and GPU field - - cuda::GPUField corresponds to field::GhostLayerField - - fSize is a template parameter for CPU fields and a normal parameter for GPUFields - - CPU field iterators correspond to FieldAccessors (see next section) - -\subsection cudaFieldAccess Writing CUDA kernels operating on GPUFields - - \image html cuda/doc/fieldAccess.png "Accessing fields in CUDA kernels" - - When writing a kernel that operates on a field, the first task is to distribute the data to CUDA threads and blocks. - We need a function $(blockIdx, threadIdx) \\rightarrow (x,y,z)$ or $(blockIdx, threadIdx) \\rightarrow (x,y,z,f)$. - The optimal mapping depends on many parameters: for example which layout the field has, the extends of each coordinate, - hardware parameters like warp-size, etc. - Thus this indexing function is abstracted. A few indexing strategies are already implemented which can be - substituted by custom strategies. - A indexing strategy consists of two classes: and somewhat complex Indexing class, which manages the - indexing on the host-side and a lightweight Accessor class, which is passed to the CUDA kernel. 
- - An indexing scheme is very similar to the iterator concept, it defines the bounds of the iteration, which is not necessarily the - complete field but could also be a certain sub-block, for example the ghost layer in a certain direction. - - - Lets start to write a simple kernel that doubles all values stored in a field: - \code - #include "cuda/FieldAccessor.h" - - __global__ void kernel_double( cuda::FieldAccessor<double> f ) - { - f.set( blockIdx, threadIdx ); - f.get() *= 2.0; - } - \endcode - We do not have to care about indexing, the cuda::FieldAccessor takes care of that. So this is a generic kernel that operates - on double fields. Using the cuda::FieldAccessor the current and neighboring values can be accessed and manipulated. - - This kernel can be called like this: - \code - cuda::FieldIndexing<double> indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::E, true ); - kernel_double<<< iter.gridDim(), iter.blockDim() >>> ( iter.gpuAccess() ); - \endcode - In the example above we only iterate over a slice of the field. Of course we can also iterate over the complete field, there are - various static member functions in a Indexing class to create certain iteration patterns. - The Indexing class encapsulates the information of how to launch the kernel (blockDim and gridDim) and holds the Accessor class that - is passed to the kernel. 
- - Two indexing strategies are currently provided: - - cuda::FieldIndexing and cuda::FieldAccessor (general, but slow ) - - cuda::FieldIndexingXYZ and cuda::FieldAccessorXYZ ( optimized for cell based iterating over bigger chunks, for fields where xSize bigger than warpSize ) - - \section cudaKernelWrapper Calling CUDA kernels from CPP files - \copydoc cuda::Kernel - - - -*/ -} diff --git a/src/domain_decomposition/BlockDataHandling.h b/src/domain_decomposition/BlockDataHandling.h index 0720eb572ffadafc2deb78b3733e4cf5ff225029..56b18521f0c65656b3b09b4ec5ff0a430c39c312 100644 --- a/src/domain_decomposition/BlockDataHandling.h +++ b/src/domain_decomposition/BlockDataHandling.h @@ -92,21 +92,21 @@ public: void serialize( IBlock * const, const BlockDataID &, mpi::SendBuffer & ) override { - WALBERLA_ABORT( "You are trying to serialize a block data item for which only an initialization function was registered" ); + WALBERLA_ABORT( "You are trying to serialize a block data item for which only an initialization function was registered" ) #ifdef __IBMCPP__ - return NULL; // never reached, helps to suppress a warning from the IBM compiler + return nullptr; // never reached, helps to suppress a warning from the IBM compiler #endif } T * deserialize( IBlock * const ) override { - WALBERLA_ABORT( "You are trying to deserialize a block data item for which only an initialization function was registered" ); + WALBERLA_ABORT( "You are trying to deserialize a block data item for which only an initialization function was registered" ) #ifdef __IBMCPP__ - return NULL; // never reached, helps to suppress a warning from the IBM compiler + return nullptr; // never reached, helps to suppress a warning from the IBM compiler #endif } void deserialize( IBlock * const, const BlockDataID &, mpi::RecvBuffer & ) override { - WALBERLA_ABORT( "You are trying to deserialize a block data item for which only an initialization function was registered" ); + WALBERLA_ABORT( "You are trying to 
deserialize a block data item for which only an initialization function was registered" ) } private: @@ -175,27 +175,27 @@ public: BlockData * initialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->initialize( block ); return ptr ? new BlockData( ptr ) : nullptr; } void serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->serialize( block, id, buffer ); } BlockData * deserialize( IBlock * const block ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) T * ptr = dataHandling_->deserialize( block ); return ptr ? new BlockData( ptr ) : nullptr; } void deserialize( IBlock * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override { - WALBERLA_ASSERT_NOT_NULLPTR( block ); + WALBERLA_ASSERT_NOT_NULLPTR( block ) dataHandling_->deserialize( block, id, buffer ); } @@ -269,7 +269,7 @@ public: " - block state: " << block->getState() << "\n" " - global state: " << uid::globalState() << "\n" " - additional state: " << state << "\n" - " - \"selector\": " << selection ); + " - \"selector\": " << selection ) } return dataHandling; diff --git a/src/domain_decomposition/IBlock.h b/src/domain_decomposition/IBlock.h index 06e48b6905194a71ed3970550ca65ee525876080..ef563cc057b2e9157330420f09beb1ad74050168 100644 --- a/src/domain_decomposition/IBlock.h +++ b/src/domain_decomposition/IBlock.h @@ -110,7 +110,7 @@ public: WALBERLA_ABORT( "BlockData access type violation! 
(The block data you added is of a different type than the block data you are trying to access!)" ) #endif #ifdef __IBMCPP__ - return NULL; // never reached, helps to suppress a warning from the IBM compiler + return nullptr; // never reached, helps to suppress a warning from the IBM compiler #endif } @@ -212,8 +212,6 @@ public: friend class BlockStorage; friend class StructuredBlockStorage; -public: - virtual const IBlockID& getId() const = 0; bool operator==( const IBlock& rhs ) const; @@ -466,7 +464,7 @@ inline const T* IBlock::uncheckedFastGetData( const ConstBlockDataID & index ) c WALBERLA_ASSERT_LESS( uint_t( index ), data_.size() ); if( data_[index] == nullptr ) - return NULL; + return nullptr; return data_[index]->template uncheckedFastGet< T >(); } diff --git a/src/domain_decomposition/StructuredBlockStorage.h b/src/domain_decomposition/StructuredBlockStorage.h index 574634255f49d6f969c82a108c98f0dde7582132..146a5eadb5186fb283652c834d4579ca91482e73 100644 --- a/src/domain_decomposition/StructuredBlockStorage.h +++ b/src/domain_decomposition/StructuredBlockStorage.h @@ -284,9 +284,9 @@ public: - real_t dx( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dx_.size() ); return dx_[ level ]; } ///< cell size on level "level" in x direction - real_t dy( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dy_.size() ); return dy_[ level ]; } ///< cell size on level "level" in y direction - real_t dz( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dz_.size() ); return dz_[ level ]; } ///< cell size on level "level" in z direction + real_t dx( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dx_.size() ) return dx_[ level ]; } ///< cell size on level "level" in x direction + real_t dy( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dy_.size() ) return dy_[ level ]; } ///< cell size on level "level" in y direction + real_t dz( const uint_t level = 0 ) const { WALBERLA_ASSERT_LESS( level, dz_.size() ) 
return dz_[ level ]; } ///< cell size on level "level" in z direction void mapToPeriodicDomain( Cell& cell, const uint_t level = 0 ) const; // -> for documentation of this function see StructuredBlockStorage.cpp @@ -354,7 +354,7 @@ public: /// Returns the block data ID required for accessing the cell bounding box of blocks - fails in debug mode if no block cell bounding boxes /// have been created via "createCellBoundingBoxes()". (remember: every block resides on exactly one grid level, and all blocks managed by a // structured block storage are assigned a corresponding cell bounding box as block data once "createCellBoundingBoxes()" is called.) - inline ConstBlockDataID getBlockCellBBId() const { WALBERLA_ASSERT( blockCellBBCreated_ ); return blockCellBBId_; } + inline ConstBlockDataID getBlockCellBBId() const { WALBERLA_ASSERT( blockCellBBCreated_ ) return blockCellBBId_; } inline const CellInterval& getBlockCellBB( const IBlock& block ) const; @@ -488,7 +488,7 @@ inline bool StructuredBlockStorage::operator==( const StructuredBlockStorage& rh inline const CellInterval& StructuredBlockStorage::getDomainCellBB( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return domainCellBB_[ level ]; } @@ -497,7 +497,7 @@ inline const CellInterval& StructuredBlockStorage::getDomainCellBB( const uint_t inline uint_t StructuredBlockStorage::getNumberOfXCells( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].xMax() + 1 ); } @@ -506,7 +506,7 @@ inline uint_t StructuredBlockStorage::getNumberOfXCells( const uint_t level ) co inline uint_t StructuredBlockStorage::getNumberOfYCells( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].yMax() + 1 ); 
} @@ -515,7 +515,7 @@ inline uint_t StructuredBlockStorage::getNumberOfYCells( const uint_t level ) co inline uint_t StructuredBlockStorage::getNumberOfZCells( const uint_t level ) const { - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].zMax() + 1 ); } @@ -524,8 +524,8 @@ inline uint_t StructuredBlockStorage::getNumberOfZCells( const uint_t level ) co inline uint_t StructuredBlockStorage::getNumberOfCells( const uint_t index, const uint_t level ) const { - WALBERLA_ASSERT_LESS( index, uint_t(3) ); - WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ); + WALBERLA_ASSERT_LESS( index, uint_t(3) ) + WALBERLA_ASSERT_LESS( level, domainCellBB_.size() ) return uint_c( domainCellBB_[ level ].max()[ index ] + 1 ); } @@ -689,7 +689,9 @@ inline AABB StructuredBlockStorage::getAABBFromCellBB( const CellInterval& cellB //********************************************************************************************************************** inline const IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const uint_t level ) const { - real_t x, y, z; + real_t x; + real_t y; + real_t z; getCellCenter( x, y, z, cell, level ); const IBlock* block = blockStorage_->getBlock(x,y,z); @@ -712,7 +714,9 @@ inline const IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const u //********************************************************************************************************************** inline IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const uint_t level ) { - real_t x, y, z; + real_t x; + real_t y; + real_t z; getCellCenter( x, y, z, cell, level ); IBlock* block = blockStorage_->getBlock(x,y,z); @@ -736,8 +740,8 @@ inline IBlock* StructuredBlockStorage::getBlock( const Cell& cell, const uint_t //********************************************************************************************************************** inline const CellInterval& 
StructuredBlockStorage::getBlockCellBB( const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); - WALBERLA_ASSERT( blockCellBBCreated_ ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) + WALBERLA_ASSERT( blockCellBBCreated_ ) return *(block.uncheckedFastGetData< CellInterval >( blockCellBBId_ )); } @@ -769,12 +773,12 @@ inline Cell StructuredBlockStorage::getBlockLocalCell( const IBlock& block, cons //********************************************************************************************************************** inline void StructuredBlockStorage::getBlockLocalCell( Cell& localCell, const IBlock& block, const real_t x, const real_t y, const real_t z ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const AABB & aabb = block.getAABB(); const uint_t level = getLevel( block ); - WALBERLA_ASSERT_LESS( level, levels_ ); + WALBERLA_ASSERT_LESS( level, levels_ ) localCell.x() = cell_idx_c( std::floor( ( x - aabb.xMin() ) / dx( level ) ) ); localCell.y() = cell_idx_c( std::floor( ( y - aabb.yMin() ) / dy( level ) ) ); @@ -805,12 +809,12 @@ inline Vector3< real_t > StructuredBlockStorage::getBlockLocalCellCenter( const //********************************************************************************************************************** inline void StructuredBlockStorage::getBlockLocalCellCenter( const IBlock & block, const Cell & localCell, real_t & x, real_t & y, real_t & z ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const AABB & aabb = block.getAABB(); const uint_t level = getLevel( block ); - WALBERLA_ASSERT_LESS( level, levels_ ); + WALBERLA_ASSERT_LESS( level, levels_ ) x = aabb.xMin() + ( real_c( localCell.x() ) + real_c(0.5) ) * dx( level ); y = aabb.yMin() + 
( real_c( localCell.y() ) + real_c(0.5) ) * dy( level ); @@ -842,12 +846,12 @@ inline AABB StructuredBlockStorage::getBlockLocalCellAABB( const IBlock & block, //********************************************************************************************************************** inline void StructuredBlockStorage::getBlockLocalCellAABB( const IBlock & block, const Cell & localCell, AABB & aabb ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const AABB& blockAABB = block.getAABB(); const uint_t level = getLevel( block ); - WALBERLA_ASSERT_LESS( level, levels_ ); + WALBERLA_ASSERT_LESS( level, levels_ ) const real_t x = blockAABB.xMin() + real_c( localCell.x() ) * dx( level ); const real_t y = blockAABB.yMin() + real_c( localCell.y() ) * dy( level ); @@ -866,7 +870,7 @@ inline void StructuredBlockStorage::getBlockLocalCellAABB( const IBlock & block, //********************************************************************************************************************** inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> & local, const IBlock& block, const Vector3<real_t> & global ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -886,7 +890,7 @@ inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> & point, const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -906,7 +910,7 @@ inline 
void StructuredBlockStorage::transformGlobalToBlockLocal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> & global, const IBlock& block, const Vector3<real_t> & local ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -927,7 +931,7 @@ inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> & point, const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const uint_t level = getLevel( block ); @@ -949,7 +953,7 @@ inline void StructuredBlockStorage::transformBlockLocalToGlobal( Vector3<real_t> //********************************************************************************************************************** inline void StructuredBlockStorage::transformGlobalToBlockLocalCell( Cell& local, const IBlock& block, const Cell& global ) const { - WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const CellInterval& cellBB = getBlockCellBB( block ); @@ -981,7 +985,7 @@ inline void StructuredBlockStorage::transformGlobalToBlockLocalCell( Cell& cell, //********************************************************************************************************************** inline void StructuredBlockStorage::transformBlockLocalToGlobalCell( Cell& global, const IBlock& block, const Cell& local ) const { - 
WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ); + WALBERLA_ASSERT_EQUAL( blockStorage_.get(), &(block.getBlockStorage()) ) const CellInterval& cellBB = getBlockCellBB( block ); diff --git a/src/field/AddToStorage.h b/src/field/AddToStorage.h index c60484babbe8d55e8c35c212779c22580e2d9c5e..d1ef11d921e4e305ada51f590a0a1035be5d6ec5 100644 --- a/src/field/AddToStorage.h +++ b/src/field/AddToStorage.h @@ -150,7 +150,7 @@ template< typename GhostLayerField_T, typename BlockStorage_T > BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, const typename GhostLayerField_T::value_type & initValue = typename GhostLayerField_T::value_type(), - const Layout layout = zyxf, + const Layout layout = fzyx, const uint_t nrOfGhostLayers = uint_t(1), const bool alwaysInitialize = false, const std::function< void ( GhostLayerField_T * field, IBlock * const block ) > & initFunction = @@ -163,6 +163,24 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, } +template< typename GhostLayerField_T, typename BlockStorage_T > +BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, + const std::string & identifier, + const typename GhostLayerField_T::value_type & initValue, + const Layout layout, + const uint_t nrOfGhostLayers, + const shared_ptr< field::FieldAllocator<typename GhostLayerField_T::value_type> > alloc) +{ + auto alwaysInitialize = false; + auto initFunction = std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(); + auto requiredSelectors = Set<SUID>::emptySet(); + auto incompatibleSelectors = Set<SUID>::emptySet(); + auto calculateSize = internal::defaultSize; + + return internal::AddToStorage< GhostLayerField_T, BlockStorage_T >::add( blocks, identifier, initValue, layout, nrOfGhostLayers, + alwaysInitialize, initFunction, requiredSelectors, incompatibleSelectors, calculateSize, alloc ); +} + template< typename GhostLayerField_T, typename 
BlockStorage_T > BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, @@ -173,9 +191,9 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, const bool alwaysInitialize, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) { + auto initFunction = std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(); return addToStorage< GhostLayerField_T >( blocks, identifier, initValue, layout, nrOfGhostLayers, alwaysInitialize, - std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(), - requiredSelectors, incompatibleSelectors ); + initFunction, requiredSelectors, incompatibleSelectors ); } @@ -185,7 +203,7 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, const std::function< Vector3< uint_t > ( const shared_ptr< StructuredBlockStorage > &, IBlock * const ) >& calculateSize, const typename GhostLayerField_T::value_type & initValue = typename GhostLayerField_T::value_type(), - const Layout layout = zyxf, + const Layout layout = fzyx, const uint_t nrOfGhostLayers = uint_t(1), const bool alwaysInitialize = false, const std::function< void ( GhostLayerField_T * field, IBlock * const block ) > & initFunction = @@ -210,9 +228,9 @@ BlockDataID addToStorage( const shared_ptr< BlockStorage_T > & blocks, const bool alwaysInitialize, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) { + auto initFunction = std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(); return addToStorage< GhostLayerField_T >( blocks, identifier, initValue, layout, nrOfGhostLayers, alwaysInitialize, - std::function< void ( GhostLayerField_T * field, IBlock * const block ) >(), - requiredSelectors, incompatibleSelectors, calculateSize ); + initFunction, requiredSelectors, incompatibleSelectors, calculateSize ); } @@ -297,7 +315,7 @@ struct Creator : public 
domain_decomposition::BlockDataCreator< GhostLayerField_ const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors, const typename GhostLayerField_T::value_type & initValue = typename GhostLayerField_T::value_type(), - const Layout layout = zyxf, + const Layout layout = fzyx, const uint_t nrOfGhostLayers = uint_t(1), const bool /*alwaysInitialize*/ = false, const std::function< void ( GhostLayerField_T * field, IBlock * const block ) > & initFunction = @@ -315,7 +333,7 @@ struct Creator : public domain_decomposition::BlockDataCreator< GhostLayerField_ const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), const typename GhostLayerField_T::value_type & initValue = typename GhostLayerField_T::value_type(), - const Layout layout = zyxf, + const Layout layout = fzyx, const uint_t nrOfGhostLayers = uint_t(1) ) : domain_decomposition::BlockDataCreator< GhostLayerField_T >( shared_ptr< DefaultBlockDataHandling< GhostLayerField_T > >(), identifier, requiredSelectors, incompatibleSelectors ) @@ -336,7 +354,7 @@ struct Creator< GhostLayerField_T, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors, const typename GhostLayerField_T::value_type & initValue = typename GhostLayerField_T::value_type(), - const Layout layout = zyxf, + const Layout layout = fzyx, const uint_t nrOfGhostLayers = uint_t(1), const bool alwaysInitialize = false, const std::function< void ( GhostLayerField_T * field, IBlock * const block ) > & initFunction = @@ -362,7 +380,7 @@ struct Creator< GhostLayerField_T, const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), const typename GhostLayerField_T::value_type & initValue = typename GhostLayerField_T::value_type(), - const Layout layout = zyxf, + const Layout layout = fzyx, const uint_t nrOfGhostLayers = uint_t(1) ) : domain_decomposition::BlockDataCreator< 
GhostLayerField_T >( shared_ptr< DefaultBlockDataHandling< GhostLayerField_T > >(), identifier, requiredSelectors, incompatibleSelectors ) diff --git a/src/field/EvaluationFilter.h b/src/field/EvaluationFilter.h index 7c9ac2c1e8fd9c66e778bba5b85a362922bd6cd3..1cd0f57f96c5f45c723e2ef2f5aed0a5d99b5aaa 100644 --- a/src/field/EvaluationFilter.h +++ b/src/field/EvaluationFilter.h @@ -74,13 +74,13 @@ public: void operator()( const IBlock & block ) { flagField_ = block.template getData< const FlagField_T >( flagFieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ); + WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ) evaluationMask_ = flagField_->getMask( cellsToEvaluate_ ); } bool operator()( const cell_idx_t x, const cell_idx_t y, const cell_idx_t z ) const { - WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ); + WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ) return flagField_->isPartOfMaskSet( x, y, z, evaluationMask_ ); } diff --git a/src/field/Field.h b/src/field/Field.h index a01fc76cddc32539c7e482e0159a249f1189c54f..9a8e33c3ae9ba9aabec6c0f67f6503dbf6990c3b 100644 --- a/src/field/Field.h +++ b/src/field/Field.h @@ -100,18 +100,18 @@ namespace field { Field( uint_t xSize, uint_t ySize, uint_t zSize, - const Layout & layout = zyxf, + const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); Field( uint_t xSize, uint_t ySize, uint_t zSize, - const T & initValue, const Layout & layout = zyxf, + const T & initValue, const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); Field( uint_t xSize, uint_t ySize, uint_t zSize, - const std::vector<T> & fValues, const Layout & layout = zyxf, + const std::vector<T> & fValues, const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); virtual ~Field(); - void init( uint_t xSize, uint_t ySize, uint_t zSize, const Layout & layout = zyxf, + void init( uint_t xSize, uint_t ySize, uint_t zSize, 
const Layout & layout = fzyx, shared_ptr<FieldAllocator<T> > alloc = shared_ptr<FieldAllocator<T> >(), uint_t innerGhostLayerSizeForAlignedAlloc = 0 ); diff --git a/src/field/Field.impl.h b/src/field/Field.impl.h index 0a0fdb63801fefddc5577c023119726f3559e245..af93706e467f1735c8989230779b1d5a378293cd 100644 --- a/src/field/Field.impl.h +++ b/src/field/Field.impl.h @@ -284,7 +284,7 @@ namespace field { zfact_ ( other.zfact_*cell_idx_t(fSize_/fSize2) ), allocator_ ( std::shared_ptr<FieldAllocator<T>>(other.allocator_, reinterpret_cast<FieldAllocator<T>*>(other.allocator_.get())) ) { - WALBERLA_CHECK_EQUAL(layout_, Layout::zyxf); + WALBERLA_CHECK_EQUAL(layout_, Layout::zyxf) static_assert(fSize_ % fSize2 == 0, "number of field components do not match"); static_assert(std::is_same<typename Field<T2,fSize2>::FlattenedField, Field<T,fSize_>>::value, "field types are incompatible for flattening"); allocator_->incrementReferenceCount ( values_ ); @@ -357,7 +357,7 @@ namespace field { layout_ = l; - WALBERLA_ASSERT(layout_ == zyxf || layout_ == fzyx); + WALBERLA_ASSERT(layout_ == zyxf || layout_ == fzyx) if (layout_ == fzyx ) { values_ = allocator_->allocate(fSize_, zSize_, ySize_, xSize_, zAllocSize_, yAllocSize_, xAllocSize_); diff --git a/src/field/GhostLayerField.h b/src/field/GhostLayerField.h index 381a06ccd9caffc903ebe7cd5f45b8f4f2c3de9b..345d497cdc4c04a789d46d391b909f3fb0322aa5 100644 --- a/src/field/GhostLayerField.h +++ b/src/field/GhostLayerField.h @@ -78,13 +78,13 @@ namespace field { GhostLayerField( uint_t xSize, uint_t ySize, uint_t zSize, uint_t gl, - const Layout & layout = zyxf, + const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); GhostLayerField( uint_t xSize, uint_t ySize, uint_t zSize, uint_t gl, - const T & initValue, const Layout & layout = zyxf, + const T & initValue, const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); 
GhostLayerField( uint_t xSize, uint_t ySize, uint_t zSize, uint_t gl, - const std::vector<T> & fValues, const Layout & layout = zyxf, + const std::vector<T> & fValues, const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); ~GhostLayerField() override = default; @@ -95,7 +95,7 @@ namespace field { uint_t ySizeWithoutGhostLayer, uint_t zSizeWithoutGhostLayer, uint_t nrGhostLayers, - const Layout & layout = zyxf, + const Layout & layout = fzyx, const shared_ptr<FieldAllocator<T> > &alloc = shared_ptr<FieldAllocator<T> >() ); @@ -202,6 +202,20 @@ namespace field { //@} //**************************************************************************************************************** + //** TimestepInformation ***************************************************************************************** + /*! \name TimestepCounter */ + //@{ + inline uint8_t advanceTimestep() + { + timestepCounter_ = (timestepCounter_ + 1) & 1; + return timestepCounter_; + } + inline uint8_t getTimestep() const { return timestepCounter_; } + inline uint8_t getTimestepPlusOne() const { return (timestepCounter_ + 1) & 1; } + inline bool isEvenTimeStep() const {return (((timestepCounter_) &1) ^ 1); } + //@} + //**************************************************************************************************************** + protected: GhostLayerField( ); @@ -221,6 +235,8 @@ namespace field { template <typename T2, uint_t fSize2> friend class GhostLayerField; + + uint8_t timestepCounter_; }; } // namespace field diff --git a/src/field/GhostLayerField.impl.h b/src/field/GhostLayerField.impl.h index d594274e78e27034b7a3d495e02a49dcaf8da28c..1a8b758ca7a9f717ed0786c4edbb7ed24195a410 100644 --- a/src/field/GhostLayerField.impl.h +++ b/src/field/GhostLayerField.impl.h @@ -44,7 +44,7 @@ namespace field { *******************************************************************************************************************/ template<typename T, uint_t 
fSize_> GhostLayerField<T,fSize_>::GhostLayerField( ) - : gl_(0) + : gl_(0), timestepCounter_(0) { } @@ -128,6 +128,7 @@ namespace field { const Layout & l, const shared_ptr<FieldAllocator<T> > &alloc) { gl_ = gl; + timestepCounter_ = uint8_c(0); uint_t innerGhostLayerSize = ( l == fzyx ) ? gl : uint_t(0); Field<T,fSize_>::init( _xSize + 2*gl , _ySize + 2*gl, @@ -689,7 +690,7 @@ namespace field { template<typename T, uint_t fSize_> GhostLayerField<T,fSize_>::GhostLayerField(const GhostLayerField<T,fSize_> & other) : Field<T,fSize_>::Field(other), - gl_( other.gl_ ) + gl_( other.gl_ ), timestepCounter_( other.timestepCounter_ ) { } @@ -700,7 +701,7 @@ namespace field { template <typename T2, uint_t fSize2> GhostLayerField<T,fSize_>::GhostLayerField(const GhostLayerField<T2,fSize2> & other) : Field<T,fSize_>::Field(other), - gl_( other.gl_ ) + gl_( other.gl_ ), timestepCounter_( other.timestepCounter_ ) { } @@ -756,11 +757,11 @@ namespace field { // Assert that there is still space for ghost-layers after slicing - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->xAllocSize() - uint_c(this->xOff()) - this->xSize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->yAllocSize() - uint_c(this->yOff()) - this->ySize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->zAllocSize() - uint_c(this->zOff()) - this->zSize(), gl_ ); } @@ -771,11 +772,11 @@ namespace field { Field<T,fSize_>::shiftCoordinates( cx, cy, cz ); // Assert that there is still space for ghost-layers after slicing - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->xOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( 
this->xAllocSize() - uint_c(this->xOff()) - this->xSize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->yOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->yAllocSize() - uint_c(this->yOff()) - this->ySize(), gl_ ); - WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ); + WALBERLA_ASSERT_GREATER_EQUAL( uint_c(this->zOff()), gl_ ) WALBERLA_ASSERT_GREATER_EQUAL( this->zAllocSize() - uint_c(this->zOff()) - this->zSize(), gl_ ); } diff --git a/src/field/StabilityChecker.h b/src/field/StabilityChecker.h index dde22f11302c4a9e0ddf05fc53e9c050f48eec7e..37493707da7f300154aaea0382e6bc027944bf24 100644 --- a/src/field/StabilityChecker.h +++ b/src/field/StabilityChecker.h @@ -78,6 +78,12 @@ inline bool stabilityCheckerIsFinite( const Vector3<real_t> & value ) { return m * about all cells that contain non-finite vales can be logged via the Logging or saved as VTK output for further * investigation. * +* It is important to be aware that checking for non-finite values will not work when using FASTMATH: +* https://stackoverflow.com/questions/22931147/stdisinf-does-not-work-with-ffast-math-how-to-check-for-infinity +* https://community.intel.com/t5/Intel-C-Compiler/icx-2021-3-0-bug-isinf-wrong-result/m-p/1316407#M39279 +* +* Thus a different checkFunction must be used for the StabilityChecker when FASTMATH is enabled. +* * Do not create objects of class StabilityChecker directly, better use one of the various 'makeStabilityChecker' * functions below! * @@ -133,7 +139,7 @@ inline bool stabilityCheckerIsFinite( const Vector3<real_t> & value ) { return m * for immediate registration at a time loop (see field::makeSharedFunctor). 
*/ -template< typename Field_T, typename Filter_T = DefaultEvaluationFilter > +template< typename Field_T, typename Filter_T = DefaultEvaluationFilter, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )>> class StabilityChecker { private: @@ -188,8 +194,8 @@ private: uint8_t evaluate( const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const cell_idx_t f ) override { - WALBERLA_ASSERT( map_.find( this->block_ ) != map_.end() ); - WALBERLA_ASSERT( map_[ this->block_ ].find( Cell(x,y,z) ) != map_[ this->block_ ].end() ); + WALBERLA_ASSERT( map_.find( this->block_ ) != map_.end() ) + WALBERLA_ASSERT( map_[ this->block_ ].find( Cell(x,y,z) ) != map_[ this->block_ ].end() ) return ( map_[ this->block_ ][ Cell(x,y,z) ].find( f ) != map_[ this->block_ ][ Cell(x,y,z) ].end() ) ? uint8_t(1) : uint8_t(0); } @@ -240,6 +246,11 @@ public: * 'checkFrequency'-th time. Setting 'checkFrequency' to 1 means the stability check * is performed each time operator()() is called. Setting 'checkFrequency' to 0 * disables the check entirely. + * \param checkFunction If a checkFunction is provided it is used to check each value per cell. The + * checkFunction has the signature "std::function<bool ( const typename Field_T::value_type & value )>". + * By default the checkFunction checks in each cell math::finite. + * However, this will not work if the program is compiled with fast math because NaN + * is not defined then. * \param outputToStream If true, in case a non-finite value is detected in the field, information about the * corresponding cells is logged via WALBERLA_LOG_WARNING. 
* \param outputVTK If true, in case a non-finite value is detected in the field, VTK output is @@ -248,41 +259,65 @@ public: * \param incompatibleSelectors Incompatible selectors */ //******************************************************************************************************************* - StabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, - const Filter_T & filter, const uint_t checkFrequency, - const bool outputToStream = true, const bool outputVTK = true, - const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), - const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) : - blocks_( blocks ), filter_( filter ), executionCounter_( uint_c(0) ), checkFrequency_( checkFrequency ), - fieldId_( fieldId ), outputToStream_( outputToStream ), outputVTK_( outputVTK ), - vtkBaseFolder_( internal::stabilityCheckerVTKBase ), - vtkExecutionFolder_( internal::stabilityCheckerVTKFolder ), - vtkIdentifier_( internal::stabilityCheckerVTKIdentifier ), - vtkBinary_( internal::stabilityCheckerVTKBinary ), - vtkLittleEndian_( internal::stabilityCheckerVTKLittleEndian ), - vtkMPIIO_( internal::stabilityCheckerVTKMPIIO ), - vtkForcePVTU_( internal::stabilityCheckerVTKForcePVTU ), - requiredSelectors_(requiredSelectors), incompatibleSelectors_( incompatibleSelectors ) {} - - StabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, - const uint_t checkFrequency, - const bool outputToStream = true, const bool outputVTK = true, - const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), - const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) : - blocks_( blocks ), filter_( Filter_T() ), executionCounter_( uint_c(0) ), checkFrequency_( checkFrequency ), - fieldId_( fieldId ), outputToStream_( outputToStream ), outputVTK_( outputVTK ), - vtkBaseFolder_( internal::stabilityCheckerVTKBase ), - vtkExecutionFolder_( internal::stabilityCheckerVTKFolder ), - 
vtkIdentifier_( internal::stabilityCheckerVTKIdentifier ), - vtkBinary_( internal::stabilityCheckerVTKBinary ), - vtkLittleEndian_( internal::stabilityCheckerVTKLittleEndian ), - vtkMPIIO_( internal::stabilityCheckerVTKMPIIO ), - vtkForcePVTU_( internal::stabilityCheckerVTKForcePVTU ), - requiredSelectors_(requiredSelectors), incompatibleSelectors_( incompatibleSelectors ) - { - static_assert( (std::is_same< Filter_T, DefaultEvaluationFilter >::value), - "This constructor is only available if DefaultEvaluationFilter is set as filter type!" ); - } + StabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const Filter_T & filter, const uint_t checkFrequency, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) : + blocks_( blocks ), fieldId_( fieldId ), filter_( filter ), checkFrequency_( checkFrequency ), checkFunction_(internal::stabilityCheckerIsFinite<typename Field_T::value_type>), + outputToStream_( outputToStream ), outputVTK_( outputVTK ), + requiredSelectors_(requiredSelectors), incompatibleSelectors_( incompatibleSelectors ) + { +#if defined(WALBERLA_BUILD_WITH_FASTMATH) + WALBERLA_LOG_WARNING_ON_ROOT("WaLBerla was build using WALBERLA_BUILD_WITH_FASTMATH. " + "The default checkFunction of the StabilityChecker checks if NaNs are obtained. " + "With FASTMATH activated NaNs are not defined and thus the checkFunction will not work. 
" + "To make it work provide a different checkFunction.") +#endif + } + + StabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const Filter_T & filter, const uint_t checkFrequency, CheckFunction_T checkFunction, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) : + blocks_( blocks ), fieldId_( fieldId ), filter_( filter ), checkFrequency_( checkFrequency ), checkFunction_(checkFunction), + outputToStream_( outputToStream ), outputVTK_( outputVTK ), + requiredSelectors_(requiredSelectors), incompatibleSelectors_( incompatibleSelectors ){} + + + StabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const uint_t checkFrequency, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) : + blocks_( blocks ), fieldId_( fieldId ), filter_( Filter_T() ), checkFrequency_( checkFrequency ), checkFunction_(internal::stabilityCheckerIsFinite<typename Field_T::value_type>), + outputToStream_( outputToStream ), outputVTK_( outputVTK ), + requiredSelectors_(requiredSelectors), incompatibleSelectors_( incompatibleSelectors ) + { +#if defined(WALBERLA_BUILD_WITH_FASTMATH) + WALBERLA_LOG_WARNING_ON_ROOT("WaLBerla was build using WALBERLA_BUILD_WITH_FASTMATH. " + "The default checkFunction of the StabilityChecker checks if NaNs are obtained. " + "With FASTMATH activated NaNs are not defined and thus the checkFunction will not work. " + "To make it work provide a different checkFunction.") +#endif + static_assert( (std::is_same< Filter_T, DefaultEvaluationFilter >::value), + "This constructor is only available if DefaultEvaluationFilter is set as filter type!" 
); + } + + StabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const uint_t checkFrequency, CheckFunction_T checkFunction, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) : + blocks_( blocks ), fieldId_( fieldId ), filter_( Filter_T() ), checkFrequency_( checkFrequency ), checkFunction_(checkFunction), + outputToStream_( outputToStream ), outputVTK_( outputVTK ), + requiredSelectors_(requiredSelectors), incompatibleSelectors_( incompatibleSelectors ) + { + static_assert( (std::is_same< Filter_T, DefaultEvaluationFilter >::value), + "This constructor is only available if DefaultEvaluationFilter is set as filter type!" ); + } + void setVTKBaseFolder ( const std::string & vtkBaseFolder ) { vtkBaseFolder_ = vtkBaseFolder; } void setVTKExecutionFolder( const std::string & vtkExecutionFolder ) { vtkExecutionFolder_ = vtkExecutionFolder; } @@ -299,30 +334,29 @@ private: void checkBlock( const IBlock * const block ); - - weak_ptr< StructuredBlockStorage > blocks_; + ConstBlockDataID fieldId_; Filter_T filter_; - uint_t executionCounter_; + uint_t executionCounter_{uint_c(0)}; uint_t checkFrequency_; - ConstBlockDataID fieldId_; - - BlockCellsMap failedCells_; + CheckFunction_T checkFunction_; bool outputToStream_; bool outputVTK_; - - std::string vtkBaseFolder_; - std::string vtkExecutionFolder_; - std::string vtkIdentifier_; - bool vtkBinary_; - bool vtkLittleEndian_; - bool vtkMPIIO_; - bool vtkForcePVTU_; + BlockCellsMap failedCells_; + + std::string vtkBaseFolder_{internal::stabilityCheckerVTKBase}; + std::string vtkExecutionFolder_{internal::stabilityCheckerVTKFolder}; + std::string vtkIdentifier_{internal::stabilityCheckerVTKIdentifier}; + + bool vtkBinary_{internal::stabilityCheckerVTKBinary}; + bool vtkLittleEndian_{internal::stabilityCheckerVTKLittleEndian}; + bool 
vtkMPIIO_{internal::stabilityCheckerVTKMPIIO}; + bool vtkForcePVTU_{internal::stabilityCheckerVTKForcePVTU}; Set<SUID> requiredSelectors_; Set<SUID> incompatibleSelectors_; @@ -331,19 +365,20 @@ private: -template< typename Field_T, typename Filter_T > -void StabilityChecker< Field_T, Filter_T >::operator()() +template< typename Field_T, typename Filter_T, typename CheckFunction_T > +void StabilityChecker< Field_T, Filter_T, CheckFunction_T >::operator()() { ++executionCounter_; if( checkFrequency_ == uint_t(0) || ( executionCounter_ - uint_c(1) ) % checkFrequency_ != 0 ) return; auto blocks = blocks_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'StabilityChecker' for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'StabilityChecker' for a block storage object that doesn't exist anymore" ) for( auto block = blocks->begin( requiredSelectors_, incompatibleSelectors_ ); block != blocks->end(); ++block ) checkBlock( block.get() ); + if( outputToStream_ ) { std::ostringstream oss; @@ -377,7 +412,7 @@ void StabilityChecker< Field_T, Filter_T >::operator()() } if( !failedCells_.empty() ) - WALBERLA_LOG_WARNING( oss.str() ); + WALBERLA_LOG_WARNING( oss.str() ) } bool abort = !failedCells_.empty(); @@ -401,17 +436,17 @@ void StabilityChecker< Field_T, Filter_T >::operator()() vtkWriter->write(); } - WALBERLA_LOG_WARNING_ON_ROOT( "Field stability check failed - aborting program ..." ); - WALBERLA_MPI_WORLD_BARRIER(); + WALBERLA_LOG_WARNING_ON_ROOT( "Field stability check failed - aborting program ..." 
) + WALBERLA_MPI_WORLD_BARRIER() - WALBERLA_ABORT_NO_DEBUG_INFO(""); + WALBERLA_ABORT_NO_DEBUG_INFO("") } } -template< typename Field_T, typename Filter_T > -void StabilityChecker< Field_T, Filter_T >::checkBlock( const IBlock * const block ) +template< typename Field_T, typename Filter_T, typename CheckFunction_T> +void StabilityChecker< Field_T, Filter_T, CheckFunction_T >::checkBlock( const IBlock * const block ) { const Field_T * field = block->getData< Field_T >( fieldId_ ); @@ -425,7 +460,7 @@ void StabilityChecker< Field_T, Filter_T >::checkBlock( const IBlock * const blo { for( uint_t f = uint_t(0); f < Field_T::F_SIZE; ++f ) { - if( !internal::stabilityCheckerIsFinite( field->get( x, y, z, cell_idx_c(f) ) ) ) + if( !checkFunction_( field->get( x, y, z, cell_idx_c(f) ) ) ) failedCells_[ block ][ Cell(x,y,z) ].insert( cell_idx_c(f) ); } } @@ -452,7 +487,7 @@ void StabilityChecker< Field_T, Filter_T >::checkBlock( const IBlock * const blo { for( uint_t f = uint_t(0); f < Field_T::F_SIZE; ++f ) { - if( !internal::stabilityCheckerIsFinite( field->get( x, y, z, cell_idx_c(f) ) ) ) + if( !checkFunction_( field->get( x, y, z, cell_idx_c(f) ) ) ) { #pragma omp critical (StabilityChecker) { @@ -479,7 +514,7 @@ void StabilityChecker< Field_T, Filter_T >::checkBlock( const IBlock * const blo { for( uint_t f = uint_t(0); f < Field_T::F_SIZE; ++f ) { - if( !internal::stabilityCheckerIsFinite( field->get( x, y, z, cell_idx_c(f) ) ) ) + if( !checkFunction_( field->get( x, y, z, cell_idx_c(f) ) ) ) { #pragma omp critical (StabilityChecker) { @@ -498,12 +533,11 @@ void StabilityChecker< Field_T, Filter_T >::checkBlock( const IBlock * const blo } - /////////////////////////////////////////////////////////////// // makeStabilityChecker functions without configuration file // /////////////////////////////////////////////////////////////// -template< typename Field_T > +template< typename Field_T> shared_ptr< StabilityChecker< Field_T > > makeStabilityChecker( const weak_ptr< 
StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, const uint_t checkFrequency, const bool outputToStream = true, const bool outputVTK = true, @@ -514,32 +548,70 @@ shared_ptr< StabilityChecker< Field_T > > makeStabilityChecker( const weak_ptr< return shared_ptr< SC_T >( new SC_T( blocks, fieldId, checkFrequency, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); } + +template< typename Field_T, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )>> +shared_ptr< StabilityChecker< Field_T > > makeStabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const uint_t checkFrequency, CheckFunction_T checkFunction, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) +{ + using SC_T = StabilityChecker<Field_T>; + return shared_ptr< SC_T >( new SC_T( blocks, fieldId, checkFrequency, checkFunction, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); +} + template< typename Field_T, typename FlagField_T > shared_ptr< StabilityChecker< Field_T, FlagFieldEvaluationFilter<FlagField_T> > > -makeStabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, - const ConstBlockDataID & fieldId, const ConstBlockDataID & flagFieldId, const Set< FlagUID > & cellsToEvaluate, - const uint_t checkFrequency, - const bool outputToStream = true, const bool outputVTK = true, - const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), - const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) + makeStabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, + const ConstBlockDataID & fieldId, const ConstBlockDataID & flagFieldId, const Set< FlagUID > & cellsToEvaluate, + const uint_t checkFrequency, + const bool outputToStream = true, const bool outputVTK = true, + const 
Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) { using SC_T = StabilityChecker<Field_T, FlagFieldEvaluationFilter<FlagField_T>>; return shared_ptr< SC_T >( new SC_T( blocks, fieldId, FlagFieldEvaluationFilter<FlagField_T>( flagFieldId, cellsToEvaluate ), - checkFrequency, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); + checkFrequency, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); +} + +template< typename Field_T, typename FlagField_T, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )>> +shared_ptr< StabilityChecker< Field_T, FlagFieldEvaluationFilter<FlagField_T> > > + makeStabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, + const ConstBlockDataID & fieldId, const ConstBlockDataID & flagFieldId, const Set< FlagUID > & cellsToEvaluate, + const uint_t checkFrequency, CheckFunction_T checkFunction, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) +{ + using SC_T = StabilityChecker<Field_T, FlagFieldEvaluationFilter<FlagField_T>>; + return shared_ptr< SC_T >( new SC_T( blocks, fieldId, FlagFieldEvaluationFilter<FlagField_T>( flagFieldId, cellsToEvaluate ), + checkFrequency, checkFunction, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); } template< typename Field_T, typename Filter_T > shared_ptr< StabilityChecker< Field_T, Filter_T > > -makeStabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, - const Filter_T & filter, const uint_t checkFrequency, - const bool outputToStream = true, const bool outputVTK = true, - const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), - const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) + makeStabilityChecker( 
const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const Filter_T & filter, const uint_t checkFrequency, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) { using SC_T = StabilityChecker<Field_T, Filter_T>; return shared_ptr< SC_T >( new SC_T( blocks, fieldId, filter, checkFrequency, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); } +template< typename Field_T, typename Filter_T, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )>> +shared_ptr< StabilityChecker< Field_T, Filter_T > > + makeStabilityChecker( const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + const Filter_T & filter, const uint_t checkFrequency, CheckFunction_T checkFunction, + const bool outputToStream = true, const bool outputVTK = true, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) +{ + using SC_T = StabilityChecker<Field_T, Filter_T>; + return shared_ptr< SC_T >( new SC_T( blocks, fieldId, filter, checkFrequency, checkFunction, outputToStream, outputVTK, requiredSelectors, incompatibleSelectors ) ); +} + /////////////////////////////////////////////////////////// @@ -577,7 +649,7 @@ inline void stabilityCheckerConfigParser( const shared_ptr< Config > & config, c std::string & defaultVTKBaseFolder, std::string & defaultVTKExecutionFolder, std::string & defaultVTKIdentifier, bool & defaultVTKBinary, bool & defaultVTKLittleEndian, bool & defaultVTKMPIIO, bool & defaultVTKForcePVTU ) { - if( !!config ) + if(config) stabilityCheckerConfigParser( config->getGlobalBlock(), configBlockName, defaultCheckFrequency, defaultOutputToStream, defaultOutputVTK, defaultVTKBaseFolder, defaultVTKExecutionFolder, defaultVTKIdentifier, defaultVTKBinary, 
defaultVTKLittleEndian, defaultVTKMPIIO, defaultVTKForcePVTU ); @@ -623,6 +695,20 @@ shared_ptr< StabilityChecker< Field_T > > makeStabilityChecker( const Config_T & WALBERLA_FIELD_MAKE_STABILITY_CHECKER_SET_AND_RETURN() } +template< typename Field_T, typename Config_T, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )> > // Config_T may be 'shared_ptr< Config >' or 'Config::BlockHandle' +shared_ptr< StabilityChecker< Field_T > > makeStabilityChecker( const Config_T & config, + const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, + CheckFunction_T checkFunction, + const std::string & configBlockName = internal::stabilityCheckerConfigBlock, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) +{ + WALBERLA_FIELD_MAKE_STABILITY_CHECKER_CONFIG_PARSER( config ) + using SC_T = StabilityChecker<Field_T>; + auto checker = shared_ptr< SC_T >( new SC_T( blocks, fieldId, defaultCheckFrequency, checkFunction, defaultOutputToStream, defaultOutputVTK, requiredSelectors, incompatibleSelectors ) ); + WALBERLA_FIELD_MAKE_STABILITY_CHECKER_SET_AND_RETURN() +} + template< typename Field_T, typename FlagField_T, typename Config_T > // Config_T may be 'shared_ptr< Config >' or 'Config::BlockHandle' shared_ptr< StabilityChecker< Field_T, FlagFieldEvaluationFilter<FlagField_T> > > makeStabilityChecker( const Config_T & config, @@ -639,6 +725,23 @@ makeStabilityChecker( const Config_T & config, WALBERLA_FIELD_MAKE_STABILITY_CHECKER_SET_AND_RETURN() } +template< typename Field_T, typename FlagField_T, typename Config_T, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )> > // Config_T may be 'shared_ptr< Config >' or 'Config::BlockHandle' +shared_ptr< StabilityChecker< Field_T, FlagFieldEvaluationFilter<FlagField_T> > > + makeStabilityChecker( const Config_T & config, + const weak_ptr< 
StructuredBlockStorage > & blocks, + const ConstBlockDataID & fieldId, const ConstBlockDataID & flagFieldId, const Set< FlagUID > & cellsToEvaluate, + CheckFunction_T checkFunction, + const std::string & configBlockName = internal::stabilityCheckerConfigBlock, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) +{ + WALBERLA_FIELD_MAKE_STABILITY_CHECKER_CONFIG_PARSER( config ) + using SC_T = StabilityChecker<Field_T, FlagFieldEvaluationFilter<FlagField_T>>; + auto checker = shared_ptr< SC_T >( new SC_T( blocks, fieldId, FlagFieldEvaluationFilter<FlagField_T>( flagFieldId, cellsToEvaluate ), + defaultCheckFrequency, checkFunction, defaultOutputToStream, defaultOutputVTK, requiredSelectors, incompatibleSelectors ) ); + WALBERLA_FIELD_MAKE_STABILITY_CHECKER_SET_AND_RETURN() +} + template< typename Field_T, typename Filter_T, typename Config_T > // Config_T may be 'shared_ptr< Config >' or 'Config::BlockHandle' shared_ptr< StabilityChecker< Field_T, Filter_T > > makeStabilityChecker( const Config_T & config, @@ -654,6 +757,22 @@ makeStabilityChecker( const Config_T & config, WALBERLA_FIELD_MAKE_STABILITY_CHECKER_SET_AND_RETURN() } +template< typename Field_T, typename Filter_T, typename Config_T, typename CheckFunction_T = std::function<bool ( const typename Field_T::value_type & value )> > // Config_T may be 'shared_ptr< Config >' or 'Config::BlockHandle' +shared_ptr< StabilityChecker< Field_T, Filter_T > > + makeStabilityChecker( const Config_T & config, + const weak_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & fieldId, const Filter_T & filter, + CheckFunction_T checkFunction, + const std::string & configBlockName = internal::stabilityCheckerConfigBlock, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) +{ + WALBERLA_FIELD_MAKE_STABILITY_CHECKER_CONFIG_PARSER( config ) + using SC_T = 
StabilityChecker<Field_T, Filter_T>; + auto checker = shared_ptr< SC_T >( new SC_T( blocks, fieldId, filter, defaultCheckFrequency, checkFunction, defaultOutputToStream, defaultOutputVTK, + requiredSelectors, incompatibleSelectors ) ); + WALBERLA_FIELD_MAKE_STABILITY_CHECKER_SET_AND_RETURN() +} + #undef WALBERLA_FIELD_MAKE_STABILITY_CHECKER_CONFIG_PARSER diff --git a/src/field/blockforest/BlockDataHandling.h b/src/field/blockforest/BlockDataHandling.h index a156e08b787a15b35797f6942720d93f625ae928..5113b895a881e3c7ba29c4f541c4801a4d6f6dfe 100644 --- a/src/field/blockforest/BlockDataHandling.h +++ b/src/field/blockforest/BlockDataHandling.h @@ -92,12 +92,12 @@ protected: void sizeCheck( const uint_t xSize, const uint_t ySize, const uint_t zSize ) { - WALBERLA_CHECK( (xSize & uint_t(1)) == uint_t(0), "The x-size of your field must be divisible by 2." ); - WALBERLA_CHECK( (ySize & uint_t(1)) == uint_t(0), "The y-size of your field must be divisible by 2." ); + WALBERLA_CHECK( (xSize & uint_t(1)) == uint_t(0), "The x-size of your field must be divisible by 2." ) + WALBERLA_CHECK( (ySize & uint_t(1)) == uint_t(0), "The y-size of your field must be divisible by 2." ) if( Pseudo2D ) - { WALBERLA_CHECK( zSize == uint_t(1), "The z-size of your field must be equal to 1 (pseudo 2D mode)." ); } + { WALBERLA_CHECK( zSize == uint_t(1), "The z-size of your field must be equal to 1 (pseudo 2D mode)." ) } else - { WALBERLA_CHECK( (zSize & uint_t(1)) == uint_t(0), "The z-size of your field must be divisible by 2." ); } + { WALBERLA_CHECK( (zSize & uint_t(1)) == uint_t(0), "The z-size of your field must be divisible by 2." 
) } } InitializationFunction_T initFunction_; @@ -110,7 +110,7 @@ template< typename Field_T, bool Pseudo2D > inline void BlockDataHandling< Field_T, Pseudo2D >::serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) { Field_T * field = block->template getData< Field_T >(id); - WALBERLA_ASSERT_NOT_NULLPTR( field ); + WALBERLA_ASSERT_NOT_NULLPTR( field ) #ifndef NDEBUG buffer << field->xSize() << field->ySize() << field->zSize() << field->fSize(); @@ -126,7 +126,7 @@ template< typename Field_T, bool Pseudo2D > void BlockDataHandling< Field_T, Pseudo2D >::serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child ) { Field_T * field = block->template getData< Field_T >(id); - WALBERLA_ASSERT_NOT_NULLPTR( field ); + WALBERLA_ASSERT_NOT_NULLPTR( field ) const uint_t xSize = field->xSize(); const uint_t ySize = field->ySize(); @@ -161,7 +161,7 @@ template< typename Field_T, bool Pseudo2D > void BlockDataHandling< Field_T, Pseudo2D >::serializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) { Field_T * field = block->template getData< Field_T >(id); - WALBERLA_ASSERT_NOT_NULLPTR( field ); + WALBERLA_ASSERT_NOT_NULLPTR( field ) const uint_t xSize = field->xSize(); const uint_t ySize = field->ySize(); @@ -210,10 +210,10 @@ inline void BlockDataHandling< Field_T, Pseudo2D >::deserialize( IBlock * const uint_t zSender( uint_t(0) ); uint_t fSender( uint_t(0) ); buffer >> xSender >> ySender >> zSender >> fSender; - WALBERLA_ASSERT_EQUAL( xSender, field->xSize() ); - WALBERLA_ASSERT_EQUAL( ySender, field->ySize() ); - WALBERLA_ASSERT_EQUAL( zSender, field->zSize() ); - WALBERLA_ASSERT_EQUAL( fSender, field->fSize() ); + WALBERLA_ASSERT_EQUAL( xSender, field->xSize() ) + WALBERLA_ASSERT_EQUAL( ySender, field->ySize() ) + WALBERLA_ASSERT_EQUAL( zSender, field->zSize() ) + WALBERLA_ASSERT_EQUAL( fSender, field->fSize() ) #endif for( auto it = field->begin(); it 
!= field->end(); ++it ) @@ -240,14 +240,14 @@ void BlockDataHandling< Field_T, Pseudo2D >::deserializeCoarseToFine( Block * co uint_t zSender( uint_t(0) ); uint_t fSender( uint_t(0) ); buffer >> branchId >> xSender >> ySender >> zSender >> fSender; - WALBERLA_ASSERT_EQUAL( branchId, block->getId().getBranchId() ); - WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ); - WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ); + WALBERLA_ASSERT_EQUAL( branchId, block->getId().getBranchId() ) + WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ) + WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ) if( Pseudo2D ) - { WALBERLA_ASSERT_EQUAL( zSender, zSize ); } + { WALBERLA_ASSERT_EQUAL( zSender, zSize ) } else - { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ); } - WALBERLA_ASSERT_EQUAL( fSender, fSize ); + { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ) } + WALBERLA_ASSERT_EQUAL( fSender, fSize ) #endif for( cell_idx_t z = cell_idx_t(0); z < cell_idx_c( zSize ); z += cell_idx_t(2) ) { @@ -295,14 +295,14 @@ void BlockDataHandling< Field_T, Pseudo2D >::deserializeFineToCoarse( Block * co uint_t zSender( uint_t(0) ); uint_t fSender( uint_t(0) ); buffer >> branchId >> xSender >> ySender >> zSender >> fSender; - WALBERLA_ASSERT_EQUAL( branchId, child ); - WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ); - WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ); + WALBERLA_ASSERT_EQUAL( branchId, child ) + WALBERLA_ASSERT_EQUAL( xSender, xSize / uint_t(2) ) + WALBERLA_ASSERT_EQUAL( ySender, ySize / uint_t(2) ) if( Pseudo2D ) - { WALBERLA_ASSERT_EQUAL( zSender, zSize ); } + { WALBERLA_ASSERT_EQUAL( zSender, zSize ) } else - { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ); } - WALBERLA_ASSERT_EQUAL( fSender, fSize ); + { WALBERLA_ASSERT_EQUAL( zSender, zSize / uint_t(2) ) } + WALBERLA_ASSERT_EQUAL( fSender, fSize ) #endif const cell_idx_t zBegin = Pseudo2D ? cell_idx_t(0) : ( (child & uint_t(4)) ? 
( cell_idx_c( zSize ) / cell_idx_t(2) ) : cell_idx_t(0) ); @@ -413,17 +413,17 @@ public: DefaultBlockDataHandling( const weak_ptr< StructuredBlockStorage > & blocks, const std::function< Vector3< uint_t > ( const shared_ptr< StructuredBlockStorage > &, IBlock * const ) >& calculateSize = internal::defaultSize, const shared_ptr< field::FieldAllocator<Value_T> > alloc = nullptr) : - blocks_( blocks ), nrOfGhostLayers_( uint_t(1) ), initValue_(), layout_( zyxf ), calculateSize_( calculateSize ), alloc_(alloc) + blocks_( blocks ), nrOfGhostLayers_( uint_t(1) ), initValue_(), layout_( fzyx ), calculateSize_( calculateSize ), alloc_(alloc) {} DefaultBlockDataHandling( const weak_ptr< StructuredBlockStorage > & blocks, const uint_t nrOfGhostLayers, const std::function< Vector3< uint_t > ( const shared_ptr< StructuredBlockStorage > &, IBlock * const ) >& calculateSize = internal::defaultSize, const shared_ptr< field::FieldAllocator<Value_T> > alloc = nullptr) : - blocks_( blocks ), nrOfGhostLayers_( nrOfGhostLayers ), initValue_(), layout_( zyxf ), calculateSize_( calculateSize ), alloc_(alloc) + blocks_( blocks ), nrOfGhostLayers_( nrOfGhostLayers ), initValue_(), layout_( fzyx ), calculateSize_( calculateSize ), alloc_(alloc) {} DefaultBlockDataHandling( const weak_ptr< StructuredBlockStorage > & blocks, const uint_t nrOfGhostLayers, - const Value_T & initValue, const Layout layout = zyxf, + const Value_T & initValue, const Layout layout = fzyx, const std::function< Vector3< uint_t > ( const shared_ptr< StructuredBlockStorage > &, IBlock * const ) >& calculateSize = internal::defaultSize, const shared_ptr< field::FieldAllocator<Value_T> > alloc = nullptr) : blocks_( blocks ), nrOfGhostLayers_( nrOfGhostLayers ), initValue_( initValue ), layout_( layout ), calculateSize_( calculateSize ), alloc_(alloc) @@ -437,7 +437,7 @@ protected: GhostLayerField_T * allocate( IBlock * const block ) override { auto blocks = blocks_.lock(); - WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying 
to access 'DefaultBlockDataHandling' for a block storage object that doesn't exist anymore" ); + WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'DefaultBlockDataHandling' for a block storage object that doesn't exist anymore" ) const Vector3< uint_t > size = calculateSize_( blocks, block ); return internal::allocate< GhostLayerField_T >( size[0], size[1], size[2], nrOfGhostLayers_, initValue_, layout_, alloc_ ); @@ -480,13 +480,13 @@ public: AlwaysInitializeBlockDataHandling( const weak_ptr< StructuredBlockStorage > & blocks, const std::function< Vector3< uint_t > ( const shared_ptr< StructuredBlockStorage > &, IBlock * const ) >& calculateSize = internal::defaultSize, const shared_ptr< field::FieldAllocator<Value_T> > alloc = nullptr) : - blocks_( blocks ), nrOfGhostLayers_( uint_t(1) ), initValue_(), layout_( zyxf ), calculateSize_( calculateSize ), alloc_(alloc) + blocks_( blocks ), nrOfGhostLayers_( uint_t(1) ), initValue_(), layout_( fzyx ), calculateSize_( calculateSize ), alloc_(alloc) {} AlwaysInitializeBlockDataHandling( const weak_ptr< StructuredBlockStorage > & blocks, const uint_t nrOfGhostLayers, const std::function< Vector3< uint_t > ( const shared_ptr< StructuredBlockStorage > &, IBlock * const ) >& calculateSize = internal::defaultSize, const shared_ptr< field::FieldAllocator<Value_T> > alloc = nullptr) : - blocks_( blocks ), nrOfGhostLayers_( nrOfGhostLayers ), initValue_(), layout_( zyxf ), calculateSize_( calculateSize ), alloc_(alloc) + blocks_( blocks ), nrOfGhostLayers_( nrOfGhostLayers ), initValue_(), layout_( fzyx ), calculateSize_( calculateSize ), alloc_(alloc) {} AlwaysInitializeBlockDataHandling( const weak_ptr< StructuredBlockStorage > & blocks, const uint_t nrOfGhostLayers, diff --git a/src/field/communication/StencilRestrictedPackInfo.h b/src/field/communication/StencilRestrictedPackInfo.h index b82050340a359f0d8c91bb96343a4f0198116c7d..3ad7f9da2fcf231d2548b4dd2bdafd060269bec8 100644 --- 
a/src/field/communication/StencilRestrictedPackInfo.h +++ b/src/field/communication/StencilRestrictedPackInfo.h @@ -73,10 +73,10 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::unpackData( IBlock * return; GhostLayerField_T * pdfField = receiver->getData< GhostLayerField_T >( fieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( pdfField ); - WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ); + WALBERLA_ASSERT_NOT_NULLPTR( pdfField ) + WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ) - stencil::Direction packerDirection = stencil::inverseDir[dir]; + stencil::Direction const packerDirection = stencil::inverseDir[dir]; for(auto i = pdfField->beginGhostLayerOnlyXYZ(dir); i != pdfField->end(); ++i ) for(uint_t f = 0; f < Stencil::d_per_d_length[packerDirection]; ++f) @@ -94,7 +94,7 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::communicateLocal( co const GhostLayerField_T * sf = sender ->getData< GhostLayerField_T >( fieldId_ ); GhostLayerField_T * rf = receiver->getData< GhostLayerField_T >( fieldId_ ); - WALBERLA_ASSERT_EQUAL( sf->xyzSize(), rf->xyzSize() ); + WALBERLA_ASSERT_EQUAL( sf->xyzSize(), rf->xyzSize() ) typename GhostLayerField_T::const_iterator srcIter = sf->beginSliceBeforeGhostLayerXYZ(dir); typename GhostLayerField_T::iterator dstIter = rf->beginGhostLayerOnlyXYZ(stencil::inverseDir[dir]); @@ -107,8 +107,8 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::communicateLocal( co ++srcIter; ++dstIter; } - WALBERLA_ASSERT( srcIter == sf->end() ); - WALBERLA_ASSERT( dstIter == rf->end() ); + WALBERLA_ASSERT( srcIter == sf->end() ) + WALBERLA_ASSERT( dstIter == rf->end() ) } @@ -120,8 +120,8 @@ void StencilRestrictedPackInfo<GhostLayerField_T, Stencil>::packDataImpl( const return; const GhostLayerField_T * pdfField = sender->getData< GhostLayerField_T >( fieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( pdfField ); - WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ); + WALBERLA_ASSERT_NOT_NULLPTR( pdfField ) + 
WALBERLA_ASSERT_EQUAL( pdfField->nrOfGhostLayers(), 1 ) for( auto i = pdfField->beginSliceBeforeGhostLayerXYZ(dir); i != pdfField->end(); ++i ) for(uint_t f = 0; f < Stencil::d_per_d_length[dir]; ++f) diff --git a/src/geometry/InitBoundaryHandling.h b/src/geometry/InitBoundaryHandling.h index fe6817d3ead08c2b7bb144f9564dd8ea8d03809d..bcea3de9305e52f37a43cb11dec6f8ea73e6ff49 100644 --- a/src/geometry/InitBoundaryHandling.h +++ b/src/geometry/InitBoundaryHandling.h @@ -141,6 +141,21 @@ void setNonBoundaryCellsToDomain( StructuredBlockStorage & blocks, BlockDataID f } } +template<typename FlagField_T> +void setNonBoundaryCellsToDomain( StructuredBlockStorage & blocks, BlockDataID flagFieldID, + field::FlagUID fluidFlagID, cell_idx_t numGhostLayers) +{ + for( auto blockIt = blocks.begin(); blockIt != blocks.end(); ++blockIt ) + { + auto flagField = blockIt->template getData<FlagField_T>( flagFieldID ); + auto fluidFlag = flagField->getOrRegisterFlag(fluidFlagID); + for( auto it = flagField->beginWithGhostLayerXYZ(numGhostLayers); it != flagField->end(); ++it ) + if ( *it == 0 ) + addFlag(it, fluidFlag); + } +} + + } // namespace geometry } // namespace walberla diff --git a/src/geometry/initializer/BoundarySetterFlagFieldSpecialization.h b/src/geometry/initializer/BoundarySetterFlagFieldSpecialization.h index 8e3ee37ee0f2b342a12e7a11e3c5f123be50f4c6..e154678f0afb0392184d144a8ebe98389cf6bde4 100644 --- a/src/geometry/initializer/BoundarySetterFlagFieldSpecialization.h +++ b/src/geometry/initializer/BoundarySetterFlagFieldSpecialization.h @@ -136,22 +136,30 @@ namespace initializer { template<typename Flag_T> void BoundarySetter<FlagField<Flag_T>>::set( cell_idx_t x, cell_idx_t y, cell_idx_t z ) { - flagField_->addFlag( x, y, z, flag_ ); + //Check if no flag is set yet to avoid multiple flags per cell on initialization + if(flagField_->get(x,y,z) == Flag_T(0)) + flagField_->addFlag( x, y, z, flag_ ); } template<typename Flag_T> void 
BoundarySetter<FlagField<Flag_T>>::set( const CellInterval & ci ) { - for( auto it = flagField_->beginSliceXYZ(ci); it != flagField_->end(); ++it ) - field::addFlag(it, flag_); + for( auto it = flagField_->beginSliceXYZ(ci); it != flagField_->end(); ++it ) { + //Check if no flag is set yet to avoid multiple flags per cell on initialization + if(*it == Flag_T(0)) + field::addFlag(it, flag_); + } } template<typename Flag_T> template< typename CellIterator > void BoundarySetter<FlagField<Flag_T> >::set( const CellIterator & begin, const CellIterator & end ) { - for(auto it = begin; it != end; ++it) - flagField_->addFlag(it->x(), it->y(), it->z(), flag_); + for(auto it = begin; it != end; ++it) { + //Check if no flag is set yet to avoid multiple flags per cell on initialization + if(flagField_->get(it->x(),it->y(),it->z()) == Flag_T(0)) + flagField_->addFlag(it->x(), it->y(), it->z(), flag_); + } } } // namespace initializer diff --git a/src/cuda/AddGPUFieldToStorage.h b/src/gpu/AddGPUFieldToStorage.h similarity index 91% rename from src/cuda/AddGPUFieldToStorage.h rename to src/gpu/AddGPUFieldToStorage.h index 3803ff8c814bb4df9e21312cbd00bee562100492..09736afad0e34e8c1cbd36db9590925e2bff3443 100644 --- a/src/cuda/AddGPUFieldToStorage.h +++ b/src/gpu/AddGPUFieldToStorage.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AddGPUFieldToStorage.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -22,17 +22,17 @@ #pragma once #include "GPUField.h" - #include "domain_decomposition/StructuredBlockStorage.h" namespace walberla { -namespace cuda { +namespace gpu +{ //******************************************************************************************************************* - /*! Adds a cuda::GPUField to a StructuredBlockStorage + /*! 
Adds a gpu::GPUField to a StructuredBlockStorage * * - Similar to walberla::field::addToStorage() functions * - created field is uninitialized @@ -49,12 +49,12 @@ namespace cuda { //******************************************************************************************************************* - /*! Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field + /*! Adds a gpu::GPUField to a StructuredBlockStorage using data from a CPU field * * - adds a GPU field to a StructuredBlockStorage using a CPU field * - sizes, number of ghostlayers and layout are the same as the CPU field * - GPU field is initialized with the data currently stored in the CPU field - * @tparam Field_T type of the CPU field, the created GPUField will be of type cuda::GPUField<Field_T::value_type> + * @tparam Field_T type of the CPU field, the created GPUField will be of type gpu::GPUField<Field_T::value_type> */ //******************************************************************************************************************* template< typename Field_T> @@ -65,7 +65,7 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/AddGPUFieldToStorage.impl.h b/src/gpu/AddGPUFieldToStorage.impl.h similarity index 96% rename from src/cuda/AddGPUFieldToStorage.impl.h rename to src/gpu/AddGPUFieldToStorage.impl.h index 1befc3e81bc4a04fd89c1ec9561e1e417023cc05..610b853265cf7b94ad5be81bb1bd9444ce2b008b 100644 --- a/src/cuda/AddGPUFieldToStorage.impl.h +++ b/src/gpu/AddGPUFieldToStorage.impl.h @@ -14,17 +14,18 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AddGPUFieldToStorage.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include "cuda/FieldCopy.h" +#include "gpu/FieldCopy.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace internal @@ -57,7 +58,7 @@ namespace cuda { auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(), f->nrOfGhostLayers(), f->layout(), usePitchedMem ); - cuda::fieldCpy( *gpuField, *f ); + gpu::fieldCpy( *gpuField, *f ); return gpuField; } @@ -90,7 +91,5 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla - - diff --git a/src/cuda/AlignedAllocation.cpp b/src/gpu/AlignedAllocation.cpp similarity index 71% rename from src/cuda/AlignedAllocation.cpp rename to src/gpu/AlignedAllocation.cpp index db6ba38509779827c942d1242c569e87cf73440b..65e58c79af1f9b3809547b85ab53789ac391907f 100644 --- a/src/cuda/AlignedAllocation.cpp +++ b/src/gpu/AlignedAllocation.cpp @@ -14,47 +14,53 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AlignedAllocation.cpp -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #include "AlignedAllocation.h" -#include "cuda/ErrorChecking.h" +#include "gpu/DeviceWrapper.h" +#include "gpu/ErrorChecking.h" #include "core/debug/CheckFunctions.h" #include "core/debug/Debug.h" -#include "core/logging/Logging.h" #include <map> namespace walberla { -namespace cuda { +namespace gpu +{ static std::map<void *, void*> freePointers_; void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset ) { + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + // With 0 alignment this function makes no sense // use normal malloc instead - WALBERLA_ASSERT_GREATER( alignment, 0 ); + WALBERLA_ASSERT_GREATER( alignment, 0 ) // Tests if alignment is power of two (assuming alignment>0) - WALBERLA_ASSERT( !(alignment & (alignment - 1)) ); + WALBERLA_ASSERT( !(alignment & (alignment - 1)) ) - WALBERLA_ASSERT_LESS( offset, alignment ); + WALBERLA_ASSERT_LESS( offset, alignment ) if( offset == 0 ) { void * result = nullptr; - WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) ); + WALBERLA_GPU_CHECK( gpuMalloc( &result, size ) ) freePointers_[result] = result; return result; } - void *pa; // pointer to allocated memory - void *ptr; // pointer to usable aligned memory + void *pa = nullptr; // pointer to allocated memory + void *ptr = nullptr; // pointer to usable aligned memory - WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment )); - WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment"); + WALBERLA_GPU_CHECK( gpuMalloc( &pa, size + alignment )); + WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "GPU malloc did not return memory with requested alignment"); ptr = (void *) ((char *) (pa) + alignment - offset); freePointers_[ptr] = pa; 
@@ -65,9 +71,14 @@ namespace cuda { void free_aligned_with_offset( void *ptr ) { + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + // assume that pointer to real allocated chunk is stored just before // chunk that was given to user - WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] )); + WALBERLA_GPU_CHECK( gpuFree( freePointers_[ptr] )); freePointers_.erase(ptr); } @@ -87,6 +98,6 @@ namespace cuda { } -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/AlignedAllocation.h b/src/gpu/AlignedAllocation.h similarity index 96% rename from src/cuda/AlignedAllocation.h rename to src/gpu/AlignedAllocation.h index 6dfb45624cea7b2fd15de2e0dfed3c04a5993112..1e935b174c5514203c7618bafb27bd2b4df208e1 100644 --- a/src/cuda/AlignedAllocation.h +++ b/src/gpu/AlignedAllocation.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file AlignedAllocation.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -24,7 +24,8 @@ namespace walberla { -namespace cuda { +namespace gpu +{ void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset ); @@ -35,5 +36,5 @@ namespace cuda { void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height, size_t alignment, size_t alignmentOffset ); -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/CMakeLists.txt b/src/gpu/CMakeLists.txt similarity index 71% rename from src/cuda/CMakeLists.txt rename to src/gpu/CMakeLists.txt index bfefb9dcc680ac5506f2113108e7283efd8c579d..fb6810d4e9241c476967bd430a9440b7b6eee86f 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/gpu/CMakeLists.txt @@ -1,24 +1,26 @@ ################################################################################################### # -# Module cuda +# Module gpu # ################################################################################################### -add_library( cuda ) -target_link_libraries( cuda PUBLIC blockforest core communication domain_decomposition executiontree field stencil lbm ) -target_sources( cuda +add_library( gpu ) +target_link_libraries( gpu PUBLIC blockforest core communication domain_decomposition executiontree field stencil lbm ) + +# sources for HIP and CUDA +target_sources( gpu PRIVATE AlignedAllocation.h AddGPUFieldToStorage.h ErrorChecking.h - ExecutionTreeGPU.h FieldCopy.h GPUCopy.cpp - NVTX.h FieldIndexingXYZ.h FieldIndexing3D.h AddGPUFieldToStorage.impl.h GPUField.h + GPUWrapper.h + DeviceWrapper.h FieldAccessor3D.h DeviceSelectMPI.h HostFieldAllocator.h @@ -27,7 +29,6 @@ target_sources( cuda GPUCopy.h FieldAccessorXYZ.h FieldIndexingXYZ.impl.h - ExecutionTreeSweepGPU.h FieldIndexing.h AlignedAllocation.cpp GPUField.impl.h @@ -35,10 +36,18 @@ target_sources( cuda FieldIndexing.impl.h Kernel.h 
ParallelStreams.h - CudaRAII.h + GPURAII.h DeviceSelectMPI.cpp ) +# sources only for CUDA +if (WALBERLA_BUILD_WITH_CUDA) +target_sources( gpu + PRIVATE + NVTX.h + ) +endif (WALBERLA_BUILD_WITH_CUDA) + add_subdirectory( sweeps ) add_subdirectory( communication ) add_subdirectory( lbm ) diff --git a/src/cuda/DeviceSelectMPI.cpp b/src/gpu/DeviceSelectMPI.cpp similarity index 50% rename from src/cuda/DeviceSelectMPI.cpp rename to src/gpu/DeviceSelectMPI.cpp index 3ba255d9f6fd926721477158243022c0012611ed..81b87b3de2965b1fa6eedd045c895cb749c6263a 100644 --- a/src/cuda/DeviceSelectMPI.cpp +++ b/src/gpu/DeviceSelectMPI.cpp @@ -14,18 +14,19 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file DeviceSelectMPI.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #include "DeviceSelectMPI.h" #include "core/mpi/MPIWrapper.h" -#include "cuda/ErrorChecking.h" +#include "gpu/ErrorChecking.h" #include "core/logging/Logging.h" namespace walberla { -namespace cuda { +namespace gpu +{ #if MPI_VERSION == 2 || MPI_VERSION == 1 @@ -37,42 +38,45 @@ void selectDeviceBasedOnMpiRank() { void selectDeviceBasedOnMpiRank() { -#ifdef WALBERLA_BUILD_WITH_MPI - int deviceCount; - WALBERLA_CUDA_CHECK( cudaGetDeviceCount( &deviceCount )); - WALBERLA_LOG_INFO_ON_ROOT( "Selecting CUDA device depending on MPI Rank" ); - MPI_Info info; - MPI_Info_create( &info ); - MPI_Comm newCommunicator; - MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator ); + WALBERLA_DEVICE_SECTION() + { + WALBERLA_MPI_SECTION() + { + int deviceCount; + WALBERLA_GPU_CHECK(gpuGetDeviceCount(&deviceCount)) + WALBERLA_LOG_INFO_ON_ROOT("Selecting device depending on MPI Rank") - int processesOnNode; - int rankOnNode; - MPI_Comm_size( newCommunicator, &processesOnNode ); - MPI_Comm_rank( newCommunicator, 
&rankOnNode ); + MPI_Info info; + MPI_Info_create(&info); + MPI_Comm newCommunicator; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator); - if ( deviceCount == processesOnNode ) - { - WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode )); - } - else if ( deviceCount > processesOnNode ) - { - WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: " - << processesOnNode << ", available GPUs on node: " << deviceCount ); - WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode )); - } - else - { - WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node " - << processesOnNode << ", available GPUs on node " << deviceCount ); - WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount )); + int processesOnNode; + int rankOnNode; + MPI_Comm_size(newCommunicator, &processesOnNode); + MPI_Comm_rank(newCommunicator, &rankOnNode); + + if (deviceCount == processesOnNode) { WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode)) } + else if (deviceCount > processesOnNode) + { + WALBERLA_LOG_WARNING("Not using all available GPUs on node. Processes on node: " + << processesOnNode << ", available GPUs on node: " << deviceCount) + WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode)) + } + else + { + WALBERLA_LOG_WARNING( + "Too many processes started per node - should be one per GPU. 
Number of processes per node " + << processesOnNode << ", available GPUs on node " << deviceCount) + WALBERLA_GPU_CHECK(gpuSetDevice(rankOnNode % deviceCount)) + } + } } -#endif } #endif -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/DeviceSelectMPI.h b/src/gpu/DeviceSelectMPI.h similarity index 92% rename from src/cuda/DeviceSelectMPI.h rename to src/gpu/DeviceSelectMPI.h index 06d4296726115a896e763e90feddbd0254aece30..5ed18edf509f966cdfd95d3b163006f05bbc2bd0 100644 --- a/src/cuda/DeviceSelectMPI.h +++ b/src/gpu/DeviceSelectMPI.h @@ -14,20 +14,22 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file DeviceSelectMPI.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once +#include "gpu/DeviceWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ /** - * Selects active CUDA device based on MPI rank + * Selects active GPU device based on MPI rank * * assumes that on each node there are as many MPI processes started as there are GPUs * - if there are more GPUs than processes on a node, a warning is printed and not all GPUs are utilized @@ -36,5 +38,5 @@ namespace cuda { */ void selectDeviceBasedOnMpiRank(); -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/gpu/DeviceWrapper.h b/src/gpu/DeviceWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..64590bd4a875bf451971f458da433b09ec5cb638 --- /dev/null +++ b/src/gpu/DeviceWrapper.h @@ -0,0 +1,292 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file DeviceWrapper.h +//! \ingroup gpu +//! \author Richard Angersbach <richard.angersbach@fau.de> +// +//====================================================================================================================== + +#pragma once + +/// \cond internal + +#include <sstream> +#include "core/Abort.h" + +// CMake generated header +#include "waLBerlaDefinitions.h" + +// DEVICE SECTION // + +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + +#define WALBERLA_DEVICE_SECTION() if (true) +#define WALBERLA_NON_DEVICE_SECTION() if (false) + +#else + +#define WALBERLA_DEVICE_SECTION() if (false) +#define WALBERLA_NON_DEVICE_SECTION() if (true) + +#endif + +namespace walberla { +namespace gpustubs { + // empty namespace which can be used +} // namespace gpustubs +} // namespace walberla + +#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) + +// include runtime header +#include "gpu/GPUWrapper.h" + +#else // WALBERLA_BUILD_WITH_GPU_SUPPORT + +namespace walberla { +namespace gpustubs { + +// dummy definitions for CUDA/HIP data types and functions in order to guarantee successful compilation without CUDA/HIP enabled + +#define WALBERLA_DEVICE_FUNCTION_ERROR \ + WALBERLA_ABORT("Invalid device function call! 
In case of compiling without CUDA/HIP, functions are not " \ + "available and shouldn't be called!"); + +#ifndef __CUDACC__ + #define __device__ + #define __global__ + #define __host__ + #define __forceinline__ +#endif + +using gpuError_t = int; +const gpuError_t gpuSuccess = 0; + +#define gpuHostAllocDefault 0x00 +#define gpuHostAllocMapped 0x02 +#define gpuHostAllocPortable 0x01 +#define gpuHostAllocWriteCombined 0x04 + +using gpuMemcpyKind = int; +const gpuMemcpyKind gpuMemcpyHostToHost = 0; +const gpuMemcpyKind gpuMemcpyHostToDevice = 1; +const gpuMemcpyKind gpuMemcpyDeviceToHost = 2; +const gpuMemcpyKind gpuMemcpyDeviceToDevice = 3; +const gpuMemcpyKind gpuMemcpyDefault = 4; + +inline const char* gpuGetErrorName(gpuError_t /*code*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline const char* gpuGetErrorString(gpuError_t /*code*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuGetLastError(void) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuPeekAtLastError(void) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuError_t gpuMalloc(void** /*devPtr*/, size_t /*size*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMallocHost(void** /*ptr*/, size_t /*size*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuHostAlloc(void** /*pHost*/, size_t /*size*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct gpuPos +{ + size_t x, y, z; +}; + +struct gpuPitchedPtr +{ + size_t pitch; + void* ptr; + size_t xsize; + size_t ysize; +}; + +struct gpuExtent +{ + size_t depth; + size_t height; + size_t width; +}; + +struct gpuArray; +typedef struct gpuArray* gpuArray_t; +typedef struct gpuArray* gpuArray_const_t; + +struct CUstream_st; +typedef struct CUstream_st* gpuStream_t; +inline gpuError_t gpuStreamDestroy(gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamCreateWithPriority(gpuStream_t* /*pStream*/, unsigned int /*flags*/, int /*priority*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline 
gpuError_t gpuStreamCreateWithFlags(gpuStream_t* /*pStream*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuDeviceGetStreamPriorityRange(int* /*leastPriority*/, int* /*greatestPriority*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamCreate(gpuStream_t* /*pStream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamSynchronize(gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct gpuMemcpy3DParms +{ + gpuArray_t dstArray; + gpuPos dstPos; + gpuPitchedPtr dstPtr; + gpuExtent extent; + gpuMemcpyKind kind; + gpuArray_t srcArray; + gpuPos srcPos; + gpuPitchedPtr srcPtr; +}; + +inline gpuError_t gpuMemcpy(void* /*dst*/, const void* /*src*/, size_t /*count*/, gpuMemcpyKind /*kind*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMemcpyAsync(void* /*dst*/, const void* /*src*/, size_t /*count*/, gpuMemcpyKind /*kind*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMemcpy3D(const gpuMemcpy3DParms* /*p*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuMemcpy3DAsync(const gpuMemcpy3DParms* /*p*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuPos make_gpuPos(size_t /*x*/, size_t /*y*/, size_t /*z*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuPitchedPtr make_gpuPitchedPtr (void* /*d*/, size_t /*p*/, size_t /*xsz*/, size_t /*ysz*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuExtent make_gpuExtent(size_t /*w*/, size_t /*h*/, size_t /*d*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuError_t gpuFree(void* /*devPtr*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuFreeHost(void* /*ptr*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +inline gpuError_t gpuDeviceSynchronize(void) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct CUevent_st; +typedef struct CUevent_st* gpuEvent_t; +inline gpuError_t gpuEventCreate(gpuEvent_t* /*event*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t 
gpuEventCreateWithFlags(gpuEvent_t* /*event*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuEventRecord(gpuEvent_t /*event*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuEventDestroy(gpuEvent_t /*event*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuStreamWaitEvent (gpuStream_t /*stream*/, gpuEvent_t /*event*/, unsigned int /*flags*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +#define gpuStreamDefault 0x00 + +inline gpuError_t gpuGetDeviceCount(int* /*count*/) { WALBERLA_DEVICE_FUNCTION_ERROR } +inline gpuError_t gpuSetDevice(int /*device*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct CUuuid_st +{ + char bytes; +}; +typedef struct CUuuid_st gpuUUID_t; + +struct gpuDeviceProp +{ + char name[256]; + gpuUUID_t uuid; + size_t totalGlobalMem; + size_t sharedMemPerBlock; + int regsPerBlock; + int warpSize; + size_t memPitch; + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int clockRate; + size_t totalConstMem; + int major; + int minor; + size_t textureAlignment; + size_t texturePitchAlignment; + int deviceOverlap; + int multiProcessorCount; + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int computeMode; + int maxTexture1D; + int maxTexture1DMipmap; + int maxTexture1DLinear; + int maxTexture2D[2]; + int maxTexture2DMipmap[2]; + int maxTexture2DLinear[3]; + int maxTexture2DGather[2]; + int maxTexture3D[3]; + int maxTexture3DAlt[3]; + int maxTextureCubemap; + int maxTexture1DLayered[2]; + int maxTexture2DLayered[3]; + int maxTextureCubemapLayered[2]; + int maxSurface1D; + int maxSurface2D[2]; + int maxSurface3D[3]; + int maxSurface1DLayered[2]; + int maxSurface2DLayered[3]; + int maxSurfaceCubemap; + int maxSurfaceCubemapLayered[2]; + size_t surfaceAlignment; + int concurrentKernels; + int ECCEnabled; + int pciBusID; + int pciDeviceID; + int pciDomainID; + int tccDriver; + int asyncEngineCount; + int unifiedAddressing; + int 
memoryClockRate; + int memoryBusWidth; + int l2CacheSize; + int persistingL2CacheMaxSize; + int maxThreadsPerMultiProcessor; + int streamPrioritiesSupported; + int globalL1CacheSupported; + int localL1CacheSupported; + size_t sharedMemPerMultiprocessor; + int regsPerMultiprocessor; + int managedMemory; + int isMultiGpuBoard; + int multiGpuBoardGroupID; + int singleToDoublePrecisionPerfRatio; + int pageableMemoryAccess; + int concurrentManagedAccess; + int computePreemptionSupported; + int canUseHostPointerForRegisteredMem; + int cooperativeLaunch; + int cooperativeMultiDeviceLaunch; + int pageableMemoryAccessUsesHostPageTables; + int directManagedMemAccessFromHost; + int accessPolicyMaxWindowSize; +}; +inline gpuError_t gpuGetDeviceProperties(gpuDeviceProp* /*prop*/, int /*device*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +struct uint3 +{ + unsigned int x, y, z; +}; +typedef struct uint3 uint3; + +struct dim3 +{ + unsigned int x, y, z; + dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} + dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} + operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } +}; +typedef struct dim3 dim3; + +inline gpuError_t gpuLaunchKernel(const void* /*func*/, dim3 /*gridDim*/, dim3 /*blockDim*/, void** /*args*/, size_t /*sharedMem*/, gpuStream_t /*stream*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +#ifdef _WIN32 +#define GPURT_CB __stdcall +#else +#define GPURT_CB +#endif + +typedef void(GPURT_CB* gpuHostFn_t)(void* /*userData*/); +inline gpuError_t gpuLaunchHostFunc(gpuStream_t /*stream*/, gpuHostFn_t /*fn*/, void* /*userData*/) { WALBERLA_DEVICE_FUNCTION_ERROR } + +#undef WALBERLA_DEVICE_FUNCTION_ERROR + +} // namespace gpustubs +using namespace gpustubs; + +} // namespace walberla + + +#endif // WALBERLA_BUILD_WITH_GPU_SUPPORT + +/// \endcond diff --git a/src/gpu/ErrorChecking.h b/src/gpu/ErrorChecking.h new file mode 100644 index 
0000000000000000000000000000000000000000..7031a936f5787fd64bd92b93d72e9d0d35b4f0e0 --- /dev/null +++ b/src/gpu/ErrorChecking.h @@ -0,0 +1,71 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file ErrorChecking.h +//! \ingroup gpu +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/Abort.h" + +#include <sstream> + +#include "gpu/DeviceWrapper.h" + +namespace walberla { +namespace gpu { + + +#define WALBERLA_GPU_CHECK(ans) { ::walberla::gpu::checkForError((ans), __FILE__, __LINE__); } +#define WALBERLA_GPU_CHECK_LAST_ERROR() { ::walberla::gpu::checkForLastError(__FILE__, __LINE__); } + + + +inline void checkForError( gpuError_t code, const std::string & callerPath, const int line ) +{ + // Oftentimes CUDA functions return an error code (if error has occurred) This function converts the error string in human-readable output. 
+ // For general error checking use checkForLastError + if(code != gpuSuccess) + { + std::stringstream ss; + ss << "GPU Error: " << code << " " << gpuGetErrorName(code) << ": " << gpuGetErrorString( code ); + Abort::instance()->abort( ss.str(), callerPath, line ); + } +} + +#ifndef NDEBUG +inline void checkForLastError( const std::string & callerPath, const int line ) +{ + // Forces immediate checking with a synchronizing. This breaks asynchrony/concurrency structure. Thus, only in debug mode executed. + gpuError_t code = gpuGetLastError(); + if(code != gpuSuccess) + { + std::stringstream ss; + ss << "CUDA Error: " << code << " " << gpuGetErrorName(code) << ": " << gpuGetErrorString( code ); + Abort::instance()->abort( ss.str(), callerPath, line ); + } +} +#else +inline void checkForLastError( const std::string & /*callerPath*/, const int /*line*/ ){} +#endif + + +} // namespace gpu +} // namespace walberla + + diff --git a/src/cuda/FieldAccessor.h b/src/gpu/FieldAccessor.h similarity index 97% rename from src/cuda/FieldAccessor.h rename to src/gpu/FieldAccessor.h index c2c676d1d00a709cda8428d903914c5718999832..cd50cc58d6e1c6ef708a1cc50e7fbcc897933281 100644 --- a/src/cuda/FieldAccessor.h +++ b/src/gpu/FieldAccessor.h @@ -14,19 +14,20 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldAccessor.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include <cuda_runtime.h> #include "core/DataTypes.h" - +#include "gpu/GPUWrapper.h" +#include "gpu/DeviceWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ @@ -108,7 +109,7 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldAccessor3D.h b/src/gpu/FieldAccessor3D.h similarity index 94% rename from src/cuda/FieldAccessor3D.h rename to src/gpu/FieldAccessor3D.h index 411b64813e0b73531716cd09759852fabc5952e1..66e95f7242ce0c183ab261f7dcd1600a57df10bb 100644 --- a/src/cuda/FieldAccessor3D.h +++ b/src/gpu/FieldAccessor3D.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldAccessor3D.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> // //====================================================================================================================== @@ -23,11 +23,9 @@ #include "core/DataTypes.h" -#include <cuda_runtime.h> - - namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> @@ -43,7 +41,7 @@ namespace cuda { const dim3 & idxDim, const dim3 & blockDim ) : ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ), - idxDim_( idxDim ), blockDim_( blockDim ), isValidPosition_( false ) + idxDim_( idxDim ), blockDim_( blockDim ) {} __device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx ) @@ -89,12 +87,12 @@ namespace cuda { uint_t fOffset_; dim3 idxDim_; dim3 blockDim_; - bool isValidPosition_; + bool isValidPosition_{ false }; }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldAccessorXYZ.h b/src/gpu/FieldAccessorXYZ.h similarity index 96% rename from 
src/cuda/FieldAccessorXYZ.h rename to src/gpu/FieldAccessorXYZ.h index 4e43dd1996600d1f24d3249104ef289eb9d8cfde..d9046a783d3b5c073b03527332cff553edf1e99b 100644 --- a/src/cuda/FieldAccessorXYZ.h +++ b/src/gpu/FieldAccessorXYZ.h @@ -14,20 +14,19 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldAccessorXYZ.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once - -#include <cuda_runtime.h> - #include "core/DataTypes.h" +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> @@ -73,7 +72,7 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/FieldCopy.h b/src/gpu/FieldCopy.h new file mode 100644 index 0000000000000000000000000000000000000000..6895661ecac9f983c2a08f16fad5bf991908dfc3 --- /dev/null +++ b/src/gpu/FieldCopy.h @@ -0,0 +1,197 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldCopy.h +//! \ingroup gpu +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "ErrorChecking.h" +#include "GPUField.h" + +#include "domain_decomposition/StructuredBlockStorage.h" +#include "field/Field.h" +#include "field/GhostLayerField.h" + +#include "core/Abort.h" +#include "core/logging/Logging.h" + + +namespace walberla { +namespace gpu +{ + + + template<typename DstType, typename SrcType> + void fieldCpy( const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID dstID, ConstBlockDataID srcID ) + { + for ( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + { + DstType * dst = blockIt->getData<DstType>( dstID ); + const SrcType * src = blockIt->getData<SrcType>( srcID ); + fieldCpy( *dst, *src ); + } + } + + template<typename DstType, typename SrcType> + std::function<void()> fieldCpyFunctor( const shared_ptr< StructuredBlockStorage > & blocks, + BlockDataID dstID, ConstBlockDataID srcID ) + { + return std::bind( fieldCpy<DstType,SrcType>, blocks, dstID, srcID ); + } + + + + template<typename DstType, typename SrcType> + void fieldCpySweepFunction( BlockDataID dstID, ConstBlockDataID srcID, IBlock * block ) + { + DstType * dst = block->getData<DstType>( dstID ); + const SrcType * src = block->getData<SrcType>( srcID ); + fieldCpy( *dst, *src ); + } + + template<typename DstType, typename SrcType> + std::function<void(IBlock*)> fieldCpyFunctor( BlockDataID dstID, ConstBlockDataID srcID ) + { + return std::bind( fieldCpySweepFunction<DstType,SrcType>, dstID, srcID, std::placeholders::_1 ); + } + + + + + + template<typename T, uint_t fs> + void fieldCpy(gpu::GPUField<T> & dst, const field::Field<T,fs> & src ); + + + + template<typename T, uint_t fs> + void fieldCpy( field::Field<T,fs> & dst, const gpu::GPUField<T> & src ); + + + + + 
//=================================================================================================================== + // + // Implementation + // + //=================================================================================================================== + + + + + template<typename T, uint_t fs> + void fieldCpy(gpu::GPUField<T> & dst, const field::Field<T,fs> & src ) + { + WALBERLA_DEVICE_SECTION() + { + gpuMemcpy3DParms p; + memset(&p, 0, sizeof(p)); + + if (dst.layout() != src.layout()) { WALBERLA_ABORT("Cannot copy fields with different layout") } + + bool canCopy = + (src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() && dst.xSize() == src.xSize()) || + (src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() && + dst.xAllocSize() == src.xAllocSize() && dst.fSize() == src.fSize()); + + if (!canCopy) { WALBERLA_ABORT("Field have to have the same size ") } + + if (dst.layout() == fzyx) + { + p.srcPtr = make_gpuPitchedPtr((void*) (src.data()), // pointer + sizeof(T) * src.xAllocSize(), // pitch + src.xAllocSize(), // inner dimension size + src.yAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.xAllocSize(), src.xAllocSize()) * sizeof(T); + p.extent.height = dst.yAllocSize(); + p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); + } + else + { + p.srcPtr = make_gpuPitchedPtr((void*) (src.data()), // pointer + sizeof(T) * src.fAllocSize(), // pitch + src.fAllocSize(), // inner dimension size + src.xAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.fAllocSize(), src.fAllocSize()) * sizeof(T); + p.extent.height = dst.xAllocSize(); + p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); + } + + p.dstPtr = dst.pitchedPtr(); + p.kind = gpuMemcpyHostToDevice; + WALBERLA_GPU_CHECK(gpuMemcpy3D(&p)) + } + } + + + + template<typename T, uint_t fs> + void fieldCpy( 
field::Field<T,fs> & dst, const gpu::GPUField<T> & src ) + { + WALBERLA_DEVICE_SECTION() + { + gpuMemcpy3DParms p; + memset(&p, 0, sizeof(p)); + + if (dst.layout() != src.layout()) { WALBERLA_ABORT("Cannot copy fields with different layout") } + + bool canCopy = + (src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() && + dst.yAllocSize() == src.yAllocSize() && dst.xSize() == src.xSize()) || + (src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() && + dst.xAllocSize() == src.xAllocSize() && dst.fSize() == src.fSize()); + + if (!canCopy) { WALBERLA_ABORT("Field have to have the same size ") } + + if (dst.layout() == fzyx) + { + p.dstPtr = make_gpuPitchedPtr((void*) (dst.data()), // pointer + sizeof(T) * dst.xAllocSize(), // pitch + dst.xAllocSize(), // inner dimension size + dst.yAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.xAllocSize(), src.xAllocSize()) * sizeof(T); + p.extent.height = dst.yAllocSize(); + p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); + } + else + { + p.dstPtr = make_gpuPitchedPtr((void*) (dst.data()), // pointer + sizeof(T) * dst.fAllocSize(), // pitch + dst.fAllocSize(), // inner dimension size + dst.xAllocSize()); // next outer dimension size + + p.extent.width = std::min(dst.fAllocSize(), src.fAllocSize()) * sizeof(T); + p.extent.height = dst.xAllocSize(); + p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); + } + + p.srcPtr = src.pitchedPtr(); + p.kind = gpuMemcpyDeviceToHost; + WALBERLA_GPU_CHECK(gpuMemcpy3D(&p)) + } + } + +} // namespace gpu +} // namespace walberla + + diff --git a/src/gpu/FieldIndexing.h b/src/gpu/FieldIndexing.h new file mode 100644 index 0000000000000000000000000000000000000000..51b337e61237690ddc5163113abeb47ee44691b1 --- /dev/null +++ b/src/gpu/FieldIndexing.h @@ -0,0 +1,94 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldIndexing.h +//! \ingroup gpu +//! \author Martin Bauer <martin.bauer@fau.de> +//! \brief Indexing Scheme that executes all elements of inner coordinate within on thread block +// +//====================================================================================================================== + +#pragma once + +#include "stencil/Directions.h" +#include "gpu/DeviceWrapper.h" + +#include "FieldAccessor.h" + +namespace walberla +{ +namespace cell +{ +class CellInterval; +} +} // namespace walberla + +namespace walberla +{ +namespace gpu +{ + +// Forward Declarations +template< typename T > +class GPUField; + +template< typename T > +class FieldIndexing +{ + public: + //** Kernel call ****************************************************************************************** + /*! 
\name Kernel call */ + //@{ + dim3 blockDim() const { return blockDim_; } + dim3 gridDim() const { return gridDim_; } + + const FieldAccessor< T >& gpuAccess() const { return gpuAccess_; } + //@} + //**************************************************************************************************************** + + //** Creation ********************************************************************************************* + /*! \name Creation */ + //@{ + static FieldIndexing< T > interval(const GPUField< T >& f, const cell::CellInterval& ci, int fBegin = 0, + int fEnd = 1); + + static FieldIndexing< T > xyz(const GPUField< T >& f); + static FieldIndexing< T > withGhostLayerXYZ(const GPUField< T >& f, uint_t numGhostLayers); + static FieldIndexing< T > ghostLayerOnlyXYZ(const GPUField< T >& f, uint_t thickness, stencil::Direction dir, + bool fullSlice = false); + static FieldIndexing< T > sliceBeforeGhostLayerXYZ(const GPUField< T >& f, uint_t thickness, stencil::Direction dir, + bool fullSlice = false); + static FieldIndexing< T > sliceXYZ(const GPUField< T >& f, cell_idx_t distance, uint_t thickness, + stencil::Direction dir, bool fullSlice = false); + + static FieldIndexing< T > allInner(const GPUField< T >& f); + static FieldIndexing< T > allWithGhostLayer(const GPUField< T >& f); + static FieldIndexing< T > all(const GPUField< T >& f, const cell::CellInterval& ci); + //@} + //**************************************************************************************************************** + + protected: + FieldIndexing(const GPUField< T >& field, dim3 _blockDim, dim3 _gridDim, const FieldAccessor< T > _gpuAccess); + + const GPUField< T >& field_; + dim3 blockDim_; + dim3 gridDim_; + FieldAccessor< T > gpuAccess_; +}; + +} // namespace gpu +} // namespace walberla + +#include "FieldIndexing.impl.h" diff --git a/src/cuda/FieldIndexing.impl.h b/src/gpu/FieldIndexing.impl.h similarity index 90% rename from src/cuda/FieldIndexing.impl.h rename to 
src/gpu/FieldIndexing.impl.h index 7acafcdbbb9e21bf81d9bd13511d2f348f56fc4d..a8c9feccfbed0e12b015fe37dadac4aeaa803450 100644 --- a/src/cuda/FieldIndexing.impl.h +++ b/src/gpu/FieldIndexing.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexing.impl.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -26,13 +26,12 @@ #include "core/debug/Debug.h" #include "field/Layout.h" -#include <cuda_runtime.h> - #include <limits> #include <cmath> namespace walberla { -namespace cuda { +namespace gpu +{ template< typename T> FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, @@ -45,17 +44,21 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - cudaDeviceProp prop; - int count; - cudaGetDeviceCount(&count); - int threadsPerBlock = std::numeric_limits<int>::max(); - for (int i = 0; i < count; i++) { - cudaGetDeviceProperties(&prop, i); - threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + WALBERLA_DEVICE_SECTION() + { + gpuDeviceProp prop; + int count; + gpuGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits< int >::max(); + for (int i = 0; i < count; i++) + { + gpuGetDeviceProperties(&prop, i); + threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock); + } + WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " + << "than the maximal thread count per GPU block.") } - WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." 
); } } @@ -91,7 +94,10 @@ void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::Ind template< typename T> FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd ) { - uint_t xOffset, yOffset, zOffset, fOffset; + uint_t xOffset; + uint_t yOffset; + uint_t zOffset; + uint_t fOffset; if ( f.layout() == field::zyxf ) { @@ -222,7 +228,7 @@ FieldIndexing<T> FieldIndexing<T>::all ( const GPUField<T> & f, const cell::Cell -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexing3D.h b/src/gpu/FieldIndexing3D.h similarity index 98% rename from src/cuda/FieldIndexing3D.h rename to src/gpu/FieldIndexing3D.h index ba93f83c6148de01e22c8d8284a441228c98e8be..dc6776cbbd3f7ea2824e8200aaf20fc8fe165eb6 100644 --- a/src/cuda/FieldIndexing3D.h +++ b/src/gpu/FieldIndexing3D.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexing3D.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! 
\brief Indexing Scheme that executes all elements of inner coordinate within on thread block // @@ -23,14 +23,13 @@ #pragma once #include "FieldAccessor3D.h" - #include "stencil/Directions.h" -#include <cuda_runtime.h> namespace walberla { namespace cell { class CellInterval; } } namespace walberla { -namespace cuda { +namespace gpu +{ // Forward Declarations template< typename T> class GPUField; @@ -99,7 +98,7 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexing3D.impl.h b/src/gpu/FieldIndexing3D.impl.h similarity index 82% rename from src/cuda/FieldIndexing3D.impl.h rename to src/gpu/FieldIndexing3D.impl.h index 8a3a0f230388f41fcef091af4b4eef9013de2f39..5aa027872d08d73da4315115c63c32344ba32702 100644 --- a/src/cuda/FieldIndexing3D.impl.h +++ b/src/gpu/FieldIndexing3D.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexing3D.impl.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> // //====================================================================================================================== @@ -27,13 +27,12 @@ #include "core/logging/Logging.h" #include "field/Layout.h" -#include <cuda_runtime.h> - #include <limits> #include <cmath> namespace walberla { -namespace cuda { +namespace gpu +{ // Returns ( a % b != 0 ) ? 
( a / b + 1 ) : ( a / b ) inline unsigned int iDivUp( unsigned int a, unsigned int b ) { return ( a + b - 1 ) / b; } @@ -53,26 +52,33 @@ FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - cudaDeviceProp prop; - int count; - cudaGetDeviceCount(&count); - int threadsPerBlock = std::numeric_limits<int>::max(); - for (int i = 0; i < count; i++) { - cudaGetDeviceProperties(&prop, i); - threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + WALBERLA_DEVICE_SECTION() + { + gpuDeviceProp prop; + int count; + gpuGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits< int >::max(); + for (int i = 0; i < count; i++) + { + gpuGetDeviceProperties(&prop, i); + threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock); + } + WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " + << "than the maximal thread count per GPU block.") } - WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." 
); } } template< typename T> FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const CellInterval & ci ) { - uint_t xOffset, yOffset, zOffset, fOffset; + uint_t xOffset; + uint_t yOffset; + uint_t zOffset; + uint_t fOffset; - WALBERLA_ASSERT( f.layout() == field::fzyx ); + WALBERLA_ASSERT( f.layout() == field::fzyx ) xOffset = sizeof(T); yOffset = f.pitchedPtr().pitch; @@ -89,9 +95,9 @@ FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const Ce dim3 idxDim( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() ); - unsigned int bx = std::min( preferredBlockDim_.x, idxDim.x ); - unsigned int by = std::min( preferredBlockDim_.y, idxDim.y ); - unsigned int bz = std::min( preferredBlockDim_.z, idxDim.z ); + unsigned int const bx = std::min( preferredBlockDim_.x, idxDim.x ); + unsigned int const by = std::min( preferredBlockDim_.y, idxDim.y ); + unsigned int const bz = std::min( preferredBlockDim_.z, idxDim.z ); dim3 gridDim( iDivUp( idxDim.x, bx ), iDivUp( idxDim.y, by ), iDivUp( idxDim.z, bz ) ); @@ -160,7 +166,7 @@ FieldIndexing3D<T> FieldIndexing3D<T>::intervalXYZ( const GPUField<T> & f, const -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexingXYZ.h b/src/gpu/FieldIndexingXYZ.h similarity index 96% rename from src/cuda/FieldIndexingXYZ.h rename to src/gpu/FieldIndexingXYZ.h index 18a6e2645b15ad296c984674ee565b3ad7e2eb89..f62161bd1a00f3ee32b8d46f34f885b5a247a8ce 100644 --- a/src/cuda/FieldIndexingXYZ.h +++ b/src/gpu/FieldIndexingXYZ.h @@ -14,22 +14,24 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexingXYZ.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include "FieldAccessorXYZ.h" -#include <cuda_runtime.h> +#include "core/DataTypes.h" +#include "DeviceWrapper.h" +#include "FieldAccessorXYZ.h" namespace walberla { namespace cell { class CellInterval; } } namespace walberla { -namespace cuda { +namespace gpu +{ // Forward Declarations template< typename T> class GPUField; @@ -73,7 +75,7 @@ template< typename T> class GPUField; }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/FieldIndexingXYZ.impl.h b/src/gpu/FieldIndexingXYZ.impl.h similarity index 81% rename from src/cuda/FieldIndexingXYZ.impl.h rename to src/gpu/FieldIndexingXYZ.impl.h index 6053e4ee8577e0687b4443f60a9bcfd1c4c1bf9c..d75560b9dff84fec9fa42bf910e735aa5106cc0d 100644 --- a/src/cuda/FieldIndexingXYZ.impl.h +++ b/src/gpu/FieldIndexingXYZ.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file FieldIndexingXYZ.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -26,7 +26,8 @@ namespace walberla { -namespace cuda { +namespace gpu +{ template< typename T> @@ -40,24 +41,31 @@ FieldIndexingXYZ<T>::FieldIndexingXYZ ( const GPUField<T> & field, { WALBERLA_DEBUG_SECTION() { - cudaDeviceProp prop; - int count; - cudaGetDeviceCount(&count); - int threadsPerBlock = std::numeric_limits<int>::max(); - for (int i = 0; i < count; i++) { - cudaGetDeviceProperties(&prop, i); - threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + WALBERLA_DEVICE_SECTION() + { + gpuDeviceProp prop; + int count; + gpuGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits< int >::max(); + for (int i = 0; i < count; i++) + { + gpuGetDeviceProperties(&prop, i); + threadsPerBlock = std::min(prop.maxThreadsPerBlock, threadsPerBlock); + } + WALBERLA_ASSERT_LESS(int_c(blockDim_.x), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " + << "than the maximal thread count per GPU block.") } - WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." 
); } } template< typename T> FieldIndexingXYZ<T> FieldIndexingXYZ<T>::interval ( const GPUField<T> & f, const CellInterval & ci ) { - size_t xOffset, yOffset, zOffset, fOffset; + size_t xOffset; + size_t yOffset; + size_t zOffset; + size_t fOffset; if ( f.layout() == field::zyxf ) { @@ -114,7 +122,7 @@ FieldIndexingXYZ<T> FieldIndexingXYZ<T>::withGhostLayerXYZ( const GPUField<T> & -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/GPUCopy.cpp b/src/gpu/GPUCopy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7406048d916885cddd88abad3d6feff4ef1818f1 --- /dev/null +++ b/src/gpu/GPUCopy.cpp @@ -0,0 +1,397 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUCopy.cpp +//! \ingroup gpu +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> +//! \brief Copy routines of 4D intervals involving GPU buffers. 
+// +//====================================================================================================================== + +#include "core/debug/Debug.h" + +#include "GPUCopy.h" + +#include <cstring> + + +namespace walberla { +namespace gpu +{ + +void copyDevToDevFZYX( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ); + + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = gpuMemcpyDeviceToDevice; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using hipMemcpy3DAsync requires 
page-locked memory on the host! + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, uint_c(1) ); + } + } +} + + +void copyDevToDevZYXF( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + p.kind = gpuMemcpyDeviceToDevice; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + 
if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + + +void copyHostToDevFZYX( const gpuPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_gpuPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny ); + + p.dstPos = make_gpuPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = gpuMemcpyHostToDevice; + + if (copyStream == nullptr) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! 
+ WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if ( Nf == 1 || ( Nz == dstAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, uint_c(1) ); + } + } +} + +void copyHostToDevZYXF( const gpuPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_gpuPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx ); + + p.dstPos = make_gpuPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_gpuPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_gpuExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + p.kind = gpuMemcpyHostToDevice; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! 
+ WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + + +void copyDevToHostFZYX( unsigned char* dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_gpuPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny ); + + p.extent = make_gpuExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = gpuMemcpyDeviceToHost; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ); + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! 
+ WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, 1 ); + } + } +} + + +void copyDevToHostZYXF( unsigned char* dst, const gpuPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + gpuStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize); + const uint_t & Ny = std::get<1>(intervalSize); + const uint_t & Nz = std::get<2>(intervalSize); + const uint_t & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset); + const uint_t & srcY = std::get<1>(srcOffset); + const uint_t & srcZ = std::get<2>(srcOffset); + const uint_t & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset); + const uint_t & dstY = std::get<1>(dstOffset); + const uint_t & dstZ = std::get<2>(dstOffset); + const uint_t & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + gpuMemcpy3DParms p; + std::memset( &p, 0, sizeof(p) ); + + p.srcPos = make_gpuPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_gpuPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_gpuPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_gpuPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx ); + + p.extent = make_gpuExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + + p.kind = gpuMemcpyDeviceToHost; + + if ( copyStream == nullptr ) + { + WALBERLA_GPU_CHECK( gpuMemcpy3D(&p) ) + } + else + { + // Using gpuMemcpy3DAsync requires page-locked memory on the host! 
+ WALBERLA_GPU_CHECK( gpuMemcpy3DAsync(&p, copyStream) ) + } + }; + + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + +} // namespace gpu +} // namespace walberla diff --git a/src/cuda/GPUCopy.h b/src/gpu/GPUCopy.h similarity index 86% rename from src/cuda/GPUCopy.h rename to src/gpu/GPUCopy.h index 775d705b384520bbecfae4f0a347f2e541731902..08e94a38a9881ad6e81d25c3e0bacce307ff9dfb 100644 --- a/src/cuda/GPUCopy.h +++ b/src/gpu/GPUCopy.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUCopy.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \brief Copy routines of 4D intervals involving GPU buffers. @@ -25,12 +25,15 @@ #include "core/DataTypes.h" +#include "gpu/ErrorChecking.h" +#include "gpu/DeviceWrapper.h" + #include <tuple> -#include <cuda_runtime.h> namespace walberla { -namespace cuda { +namespace gpu +{ //**************************************************************************************************************************** @@ -44,14 +47,14 @@ namespace cuda { * \param srcAllocSizeZ allocation size in z direction of the source buffer * \param typeSize size of an f element * \param intervalSize interval size - * \param copyStream CUDA stream, if not NULL copy operations will be performed asynchronously + * \param copyStream CUDA/HIP stream, if not NULL copy operations will be performed asynchronously *****************************************************************************************************************************/ -void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, +void copyDevToDevFZYX( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & 
dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //**************************************************************************************************************************** /*! Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout. @@ -64,58 +67,58 @@ void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, * \param srcAllocSizeY allocation size in y direction of the source buffer * \param typeSize size of an f element * \param intervalSize interval size - * \param copyStream CUDA stream, if not NULL copy operations will be performed asynchronously + * \param copyStream CUDA/HIP stream, if not NULL copy operations will be performed asynchronously *****************************************************************************************************************************/ -void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, +void copyDevToDevZYXF( const gpuPitchedPtr& dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. See \ref copyDevToDevFZYX() for * parameter information. 
*******************************************************************************************************************/ -void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, +void copyHostToDevFZYX( const gpuPitchedPtr& dst, unsigned char* src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout. See \ref copyDevToDevZYXF() for * parameter information. *******************************************************************************************************************/ -void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, +void copyHostToDevZYXF( const gpuPitchedPtr& dst, unsigned char* src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. See \ref copyDevToDevFZYX() for * parameter information. 
*******************************************************************************************************************/ -void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, +void copyDevToHostFZYX( unsigned char* dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout. See \ref copyDevToDevZYXF() for * parameter information. *******************************************************************************************************************/ -void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, +void copyDevToHostZYXF( unsigned char* dst, const gpuPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, - cudaStream_t copyStream ); + gpuStream_t copyStream ); -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/GPUField.h b/src/gpu/GPUField.h similarity index 80% rename from src/cuda/GPUField.h rename to src/gpu/GPUField.h index 431ce15263f0bb460456983ff52299f18135d4d6..f8a0242ed3aa5e9de3606d8ff1737b4fe869f42f 100755 --- a/src/cuda/GPUField.h +++ b/src/gpu/GPUField.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUField.h -//! \ingroup moduleName +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -23,15 +23,16 @@ #include "core/DataTypes.h" #include "core/cell/CellInterval.h" -#include "field/Layout.h" -#include "stencil/Directions.h" -#include <cuda_runtime.h> +#include "field/Layout.h" +#include "stencil/Directions.h" +#include "gpu/DeviceWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ using field::Layout; using field::fzyx; @@ -39,20 +40,20 @@ namespace cuda { //******************************************************************************************************************* - /*! GhostLayerField stored on a CUDA GPU + /*! GhostLayerField stored on a CUDA/HIP GPU * - * Basically a wrapper around a CUDA device pointer together with size information about the field + * Basically a wrapper around a CUDA/HIP device pointer together with size information about the field * i.e. sizes in x,y,z,f directions and number of ghost layers. * - * Internally represented by a cudaPitchedPtr which is allocated with cudaMalloc3D to take padding of the + * Internally represented by a gpuPitchedPtr which is allocated with gpuMalloc3D to take padding of the * innermost coordinate into account. * * Supports Array-of-Structures (AoS,zyxf) layout and Structure-of-Arrays (SoA, fzyx) layout, in a similar way * to field::Field * - * To work with the GPUField look at the cuda::fieldCpy functions to transfer a field::Field to a cuda::GPUField + * To work with the GPUField look at the gpu::fieldCpy functions to transfer a field::Field to a gpu::GPUField * and vice versa. - * When writing CUDA kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts. + * When writing device kernels for GPUFields have a look at the FieldIndexing and FieldAccessor concepts. * These simplify the "iteration" i.e. indexing of cells in GPUFields. 
*/ //******************************************************************************************************************* @@ -63,7 +64,7 @@ namespace cuda { typedef T value_type; GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSize, - uint_t _nrOfGhostLayers, const Layout & _layout = zyxf, bool usePitchedMem = true ); + uint_t _nrOfGhostLayers, const Layout & _layout = fzyx, bool usePitchedMem = true ); ~GPUField(); @@ -71,7 +72,7 @@ namespace cuda { bool isPitchedMem() const { return usePitchedMem_; } - cudaPitchedPtr pitchedPtr() const { return pitchedPtr_; } + gpuPitchedPtr pitchedPtr() const { return pitchedPtr_; } inline uint_t xSize() const { return xSize_; } @@ -124,9 +125,9 @@ namespace cuda { bool operator==( const GPUField & other ) const; void getGhostRegion( stencil::Direction d, CellInterval & ci, - cell_idx_t thickness, bool fullSlice ) const; + cell_idx_t thickness, bool fullSlice = false ) const; void getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, - cell_idx_t thickness, bool fullSlice ) const + cell_idx_t thickness, bool fullSlice = false ) const { getSlice( d, ci, 0, thickness, fullSlice ); } @@ -139,8 +140,22 @@ namespace cuda { T * dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f); const T * dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f) const; + //** TimestepInformation ***************************************************************************************** + /*! 
\name TimestepCounter */ + //@{ + inline uint8_t advanceTimestep() + { + timestepCounter_ = (timestepCounter_ + 1) & 1; + return timestepCounter_; + } + inline uint8_t getTimestep() const { return timestepCounter_; } + inline uint8_t getTimestepPlusOne() const { return (timestepCounter_ + 1) & 1; } + inline bool isEvenTimeStep() const {return (((timestepCounter_) &1) ^ 1); } + //@} + //**************************************************************************************************************** + protected: - cudaPitchedPtr pitchedPtr_; + gpuPitchedPtr pitchedPtr_; uint_t nrOfGhostLayers_; uint_t xSize_; uint_t ySize_; @@ -151,10 +166,11 @@ namespace cuda { uint_t fAllocSize_; Layout layout_; bool usePitchedMem_; + uint8_t timestepCounter_; }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/GPUField.impl.h b/src/gpu/GPUField.impl.h similarity index 81% rename from src/cuda/GPUField.impl.h rename to src/gpu/GPUField.impl.h index 5dd3e58409c95e4163e0b086832f69ab11cac136..dd42d088c77a3dc2c5eecddba4ae1895e31df5b3 100644 --- a/src/cuda/GPUField.impl.h +++ b/src/gpu/GPUField.impl.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUField.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -22,10 +22,12 @@ #include "GPUField.h" #include "ErrorChecking.h" #include "AlignedAllocation.h" +#include "DeviceWrapper.h" #include "core/logging/Logging.h" namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> @@ -33,42 +35,46 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz uint_t _nrOfGhostLayers, const Layout & _layout, bool usePitchedMem ) : nrOfGhostLayers_( _nrOfGhostLayers ), xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ), - layout_( _layout ), usePitchedMem_( usePitchedMem ) + layout_( _layout ), usePitchedMem_( usePitchedMem ), timestepCounter_(0) { - cudaExtent extent; - if ( layout_ == zyxf ) + WALBERLA_NON_DEVICE_SECTION() { + WALBERLA_ABORT(__FUNCTION__ << "Instantiating GPU field without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + + gpuExtent extent; + if (layout_ == zyxf) { extent.width = _fSize * sizeof(T); - extent.height = (_xSize + 2 * _nrOfGhostLayers ); - extent.depth = (_ySize + 2 * _nrOfGhostLayers ) * ( _zSize + 2 * _nrOfGhostLayers ); + extent.height = (_xSize + 2 * _nrOfGhostLayers); + extent.depth = (_ySize + 2 * _nrOfGhostLayers) * (_zSize + 2 * _nrOfGhostLayers); } else { - extent.width = (_xSize + 2 * _nrOfGhostLayers ) * sizeof(T); - extent.height = (_ySize + 2 * _nrOfGhostLayers ); - extent.depth = (_zSize + 2 * _nrOfGhostLayers ) * _fSize; + extent.width = (_xSize + 2 * _nrOfGhostLayers) * sizeof(T); + extent.height = (_ySize + 2 * _nrOfGhostLayers); + extent.depth = (_zSize + 2 * _nrOfGhostLayers) * _fSize; } - if ( usePitchedMem_ ) + if (usePitchedMem_) { size_t pitch; const size_t alignment = 256; - void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment, - sizeof(T) * nrOfGhostLayers_ ); - WALBERLA_ASSERT_EQUAL( 
size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 ); - pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height ); + void* mem = allocate_pitched_with_offset(pitch, extent.width, extent.height * extent.depth, alignment, + sizeof(T) * nrOfGhostLayers_); + WALBERLA_ASSERT_EQUAL(size_t((char*) (mem) + sizeof(T) * nrOfGhostLayers_) % alignment, 0) + pitchedPtr_ = make_gpuPitchedPtr(mem, pitch, extent.width, extent.height); } else { - pitchedPtr_ = make_cudaPitchedPtr( NULL, extent.width, extent.width, extent.height ); - WALBERLA_CUDA_CHECK ( cudaMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) ); + pitchedPtr_ = make_gpuPitchedPtr(nullptr, extent.width, extent.width, extent.height); + WALBERLA_GPU_CHECK(gpuMalloc(&pitchedPtr_.ptr, extent.width * extent.height * extent.depth)) } // allocation size is stored in pitched pointer // pitched pointer stores the amount of padded region in bytes // but we keep track of the size in #elements - WALBERLA_ASSERT_EQUAL( pitchedPtr_.pitch % sizeof(T), 0 ); - if ( layout_ == field::fzyx ) + WALBERLA_ASSERT_EQUAL(pitchedPtr_.pitch % sizeof(T), 0) + if (layout_ == field::fzyx) { xAllocSize_ = pitchedPtr_.pitch / sizeof(T); fAllocSize_ = fSize_; @@ -88,7 +94,7 @@ GPUField<T>::~GPUField() free_aligned_with_offset(pitchedPtr_.ptr ); else { - WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) ); + WALBERLA_GPU_CHECK( gpuFree( pitchedPtr_.ptr ) ) } } @@ -122,8 +128,8 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci, cell_idx_c( ySize() ), cell_idx_c( zSize() )}; - WALBERLA_ASSERT_GREATER( thickness, 0 ); - WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), nrOfGhostLayers() ); + WALBERLA_ASSERT_GREATER( thickness, 0 ) + WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), nrOfGhostLayers() ) const cell_idx_t ghosts = cell_idx_c ( thickness ); cell_idx_t fullSliceInc = fullSlice ? 
cell_idx_c( nrOfGhostLayers() ) : 0; @@ -162,7 +168,7 @@ template<typename T> void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci, cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const { - WALBERLA_ASSERT_GREATER( thickness, 0 ); + WALBERLA_ASSERT_GREATER( thickness, 0 ) const cell_idx_t sizeArr [] = { cell_idx_c( xSize() ), cell_idx_c( ySize() ), @@ -197,7 +203,7 @@ inline uint_t GPUField<T>::size( uint_t coord ) const case 1: return this->ySize(); case 2: return this->zSize(); case 3: return this->fSize(); - default: WALBERLA_ASSERT(false); return 0; + default: WALBERLA_ASSERT(false) return 0; } } @@ -227,7 +233,7 @@ bool GPUField<T>::hasSameAllocSize( const GPUField<T> & other ) const //******************************************************************************************************************* /*! Creates a new GPUField that has equal size, layout and memory type as this field but has uninitialized memory. * - * \return a new FPUField that has to be freed by caller. + * \return a new GPUField that has to be freed by caller. 
*******************************************************************************************************************/ template <typename T> GPUField<T> * GPUField<T>::cloneUninitialized() const @@ -235,10 +241,10 @@ GPUField<T> * GPUField<T>::cloneUninitialized() const GPUField<T> * res = new GPUField<T>( xSize(), ySize(), zSize(), fSize(), nrOfGhostLayers(), layout(), isPitchedMem() ); - WALBERLA_ASSERT( hasSameAllocSize( *res ) ); - WALBERLA_ASSERT( hasSameSize( *res ) ); - WALBERLA_ASSERT( layout() == res->layout() ); - WALBERLA_ASSERT( isPitchedMem() == res->isPitchedMem() ); + WALBERLA_ASSERT( hasSameAllocSize( *res ) ) + WALBERLA_ASSERT( hasSameSize( *res ) ) + WALBERLA_ASSERT( layout() == res->layout() ) + WALBERLA_ASSERT( isPitchedMem() == res->isPitchedMem() ) return res; } @@ -292,16 +298,16 @@ uint_t GPUField<T>::fAllocSize() const template<typename T> void GPUField<T>::swapDataPointers( GPUField<T> & other ) { - WALBERLA_ASSERT( hasSameAllocSize( other ) ); - WALBERLA_ASSERT( hasSameSize( other ) ); - WALBERLA_ASSERT( layout() == other.layout() ); - WALBERLA_ASSERT( isPitchedMem() == other.isPitchedMem() ); + WALBERLA_ASSERT( hasSameAllocSize( other ) ) + WALBERLA_ASSERT( hasSameSize( other ) ) + WALBERLA_ASSERT( layout() == other.layout() ) + WALBERLA_ASSERT( isPitchedMem() == other.isPitchedMem() ) std::swap( pitchedPtr_, other.pitchedPtr_ ); } -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/GPURAII.h b/src/gpu/GPURAII.h new file mode 100644 index 0000000000000000000000000000000000000000..815b3829114506a8c601669aa4195461bd60151a --- /dev/null +++ b/src/gpu/GPURAII.h @@ -0,0 +1,108 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CudaRAII.h +//! \ingroup gpu +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== +#pragma once + +#include "ErrorChecking.h" + +namespace walberla +{ +namespace gpu +{ + +class StreamRAII +{ + public: + ~StreamRAII() + { + if (stream_ != nullptr) { WALBERLA_GPU_CHECK(gpuStreamDestroy(stream_)) } + } + + StreamRAII(StreamRAII&& other) noexcept + { + stream_ = other.stream_; + other.stream_ = nullptr; + } + + StreamRAII(const StreamRAII&) = delete; + + void operator=(const StreamRAII&) = delete; + + operator gpuStream_t() const { return stream_; } + + static StreamRAII defaultStream() + { + StreamRAII result; + result.stream_ = nullptr; + return result; + } + + static StreamRAII newPriorityStream(int priority) + { + StreamRAII result; + WALBERLA_GPU_CHECK(gpuStreamCreateWithPriority(&result.stream_, gpuStreamDefault, priority)) + return result; + } + + static StreamRAII newStream() + { + StreamRAII result; + WALBERLA_GPU_CHECK(gpuStreamCreate(&result.stream_)) + return result; + } + + private: + StreamRAII() = default; + + gpuStream_t stream_; +}; + +class EventRAII +{ + public: + explicit EventRAII() + { + event = gpuEvent_t(); + WALBERLA_GPU_CHECK(gpuEventCreate(&event)) + } + + 
~EventRAII() + { + if (event != gpuEvent_t()) { WALBERLA_GPU_CHECK(gpuEventDestroy(event)) } + } + + EventRAII(const EventRAII&) = delete; + + void operator=(const EventRAII&) = delete; + + EventRAII(EventRAII&& other) noexcept + { + event = other.event; + other.event = gpuEvent_t(); + } + + operator gpuEvent_t() const { return event; } + + private: + gpuEvent_t event; +}; + +} // namespace gpu +} // namespace walberla \ No newline at end of file diff --git a/src/gpu/GPUWrapper.h b/src/gpu/GPUWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..48fcc2e1064ce32c525eed2c43896195cf059784 --- /dev/null +++ b/src/gpu/GPUWrapper.h @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUWrapper.h +//! \ingroup gpu +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +// https://rocmdocs.amd.com/en/latest/Programming_Guides/CUDAAPIHIPTEXTURE.html +#if defined(WALBERLA_BUILD_WITH_CUDA) + #include <cuda_runtime.h> + + + using gpuError_t = cudaError_t; + #define gpuSuccess cudaSuccess + #define gpuGetErrorName cudaGetErrorName + #define gpuGetErrorString cudaGetErrorString + #define gpuPeekAtLastError cudaPeekAtLastError + #define gpuGetLastError cudaGetLastError + + #define gpuMalloc cudaMalloc + #define gpuMallocHost cudaMallocHost + #define gpuHostAllocDefault cudaHostAllocDefault + #define gpuHostAlloc cudaHostAlloc + #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice + #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost + #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice + #define gpuMemcpy cudaMemcpy + #define gpuMemcpyAsync cudaMemcpyAsync + #define gpuMemcpy3D cudaMemcpy3D + #define gpuMemcpy3DParms cudaMemcpy3DParms + #define gpuMemcpy3DAsync cudaMemcpy3DAsync + + #define make_gpuPos make_cudaPos + #define make_gpuPitchedPtr make_cudaPitchedPtr + #define gpuPitchedPtr cudaPitchedPtr + #define make_gpuExtent make_cudaExtent + using gpuExtent = cudaExtent; + + #define gpuFree cudaFree + #define gpuFreeHost cudaFreeHost + + using gpuStream_t = cudaStream_t; + #define gpuStreamDestroy cudaStreamDestroy + #define gpuStreamCreateWithPriority cudaStreamCreateWithPriority + #define gpuDeviceGetStreamPriorityRange cudaDeviceGetStreamPriorityRange + #define gpuStreamCreate cudaStreamCreate + #define gpuStreamSynchronize cudaStreamSynchronize + #define gpuDeviceSynchronize cudaDeviceSynchronize + + using gpuEvent_t = cudaEvent_t; + #define gpuEventCreate cudaEventCreate + #define gpuEventRecord cudaEventRecord + #define gpuEventDestroy cudaEventDestroy + #define gpuStreamWaitEvent cudaStreamWaitEvent + #define gpuStreamDefault 
cudaStreamDefault + + #define gpuGetDeviceCount cudaGetDeviceCount + #define gpuSetDevice cudaSetDevice + #define gpuDeviceProp cudaDeviceProp + #define gpuGetDeviceProperties cudaGetDeviceProperties + + #define gpuLaunchKernel cudaLaunchKernel +#endif + + +#ifdef WALBERLA_BUILD_WITH_HIP + #include <hip/hip_runtime.h> + + + using gpuError_t = hipError_t; + #define gpuSuccess hipSuccess + #define gpuGetErrorName hipGetErrorName + #define gpuGetErrorString hipGetErrorString + #define gpuPeekAtLastError hipPeekAtLastError + #define gpuGetLastError hipGetLastError + + #define gpuMalloc hipMalloc + #define gpuMallocHost hipHostMalloc + #define gpuHostAllocDefault hipHostMallocDefault + // warning: 'hipHostAlloc' is deprecated: use hipHostMalloc insteadwarning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead + #define gpuHostAlloc hipHostMalloc + #define gpuMemcpyHostToDevice hipMemcpyHostToDevice + #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost + #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice + #define gpuMemcpy hipMemcpy + #define gpuMemcpyAsync hipMemcpyAsync + #define gpuMemcpy3D hipMemcpy3D + #define gpuMemcpy3DParms hipMemcpy3DParms + #define gpuMemcpy3DAsync hipMemcpy3DAsync + + #define make_gpuPitchedPtr make_hipPitchedPtr + #define make_gpuPos make_hipPos + using gpuPitchedPtr = hipPitchedPtr; + #define make_gpuExtent make_hipExtent + using gpuExtent = hipExtent; + + #define gpuFree hipFree + #define gpuFreeHost hipHostFree + + using gpuStream_t = hipStream_t; + #define gpuStreamDestroy hipStreamDestroy + #define gpuStreamCreateWithPriority hipStreamCreateWithPriority + #define gpuDeviceGetStreamPriorityRange hipDeviceGetStreamPriorityRange + #define gpuStreamCreate hipStreamCreate + #define gpuStreamSynchronize hipStreamSynchronize + #define gpuDeviceSynchronize hipDeviceSynchronize + + using gpuEvent_t = hipEvent_t; + #define gpuEventCreate hipEventCreate + #define gpuEventRecord hipEventRecord + #define gpuEventDestroy hipEventDestroy + 
#define gpuStreamWaitEvent hipStreamWaitEvent + #define gpuStreamDefault hipStreamDefault + + #define gpuGetDeviceCount hipGetDeviceCount + #define gpuSetDevice hipSetDevice + #define gpuDeviceProp hipDeviceProp + #define gpuGetDeviceProperties hipGetDeviceProperties + + #define gpuLaunchKernel hipLaunchKernel +#endif diff --git a/src/cuda/HostFieldAllocator.h b/src/gpu/HostFieldAllocator.h similarity index 58% rename from src/cuda/HostFieldAllocator.h rename to src/gpu/HostFieldAllocator.h index 7276c495db75933112a1af697c3181274fd7b450..8b24c3a47cd06dbf97a5ef07177a4f152f3fb509 100644 --- a/src/cuda/HostFieldAllocator.h +++ b/src/gpu/HostFieldAllocator.h @@ -14,68 +14,81 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file HostFieldAllocator.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> -//! \brief Allocator that allocates a CPU! field using cudaHostAlloc +//! \brief Allocator that allocates a CPU! field using gpuHostAlloc // //====================================================================================================================== #pragma once -#include "ErrorChecking.h" +#include "gpu/ErrorChecking.h" +#include "gpu/DeviceWrapper.h" #include "field/allocation/FieldAllocator.h" -#include <cuda_runtime.h> - namespace walberla { -namespace cuda { +namespace gpu +{ //******************************************************************************************************************* /*! - * Allocator that allocates a CPU! field using cudaHostAlloc without padding + * Allocator that allocates a CPU! 
field using gpuHostAlloc without padding * - * Uses cudaHostAlloc for the allocation - which allocates page-locked memory that is faster to transfer to the GPU - * This allocator should be used for CPU fields that are often transfered to GPU and back + * Uses gpuHostAlloc for the allocation - which allocates page-locked memory that is faster to transfer to the GPU + * This allocator should be used for CPU fields that are often transferred to GPU and back * - * \ingroup cuda + * \ingroup gpu * */ //******************************************************************************************************************* - template<typename T, unsigned int cudaHostAllocFlags = cudaHostAllocDefault> + template<typename T, unsigned int HostAllocFlags = gpuHostAllocDefault> class HostFieldAllocator : public field::FieldAllocator<T> { public: - virtual ~HostFieldAllocator() {} + virtual ~HostFieldAllocator() = default; virtual T * allocateMemory ( uint_t size0, uint_t size1, uint_t size2, uint_t size3, uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 ) { + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + allocSize1=size1; allocSize2=size2; allocSize3=size3; - void * result; - WALBERLA_CUDA_CHECK( cudaHostAlloc( &result, size0*size1*size2*size3*sizeof(T), cudaHostAllocFlags ) ); + void * result = nullptr; + WALBERLA_GPU_CHECK(gpuHostAlloc(&result, size0 * size1 * size2 * size3 * sizeof(T), HostAllocFlags)) return (T*)(result); } virtual T * allocateMemory ( uint_t size ) { - T* result; - cudaHostAlloc( &result, size*sizeof(T), cudaHostAllocFlags ); - return result; + WALBERLA_NON_DEVICE_SECTION() + { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + + void * result = nullptr; + WALBERLA_GPU_CHECK(gpuHostAlloc(&result, size*sizeof(T), HostAllocFlags)) + return (T*)(result); } virtual void deallocate(T *& 
values) { - WALBERLA_CUDA_CHECK( cudaFreeHost( values ) ); + WALBERLA_NON_DEVICE_SECTION() { + WALBERLA_ABORT(__FUNCTION__ << "Using GPU method without WALBERLA_BUILD_WITH_GPU_SUPPORT being enabled.") + } + WALBERLA_GPU_CHECK(gpuFreeHost(values)) } }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/Kernel.h b/src/gpu/Kernel.h similarity index 84% rename from src/cuda/Kernel.h rename to src/gpu/Kernel.h index cb69aa4fb238200c1bd332b9f2fbc2702f19eb97..f6c2eb687a1d54e6aea2b21f80bfd200d05b371f 100644 --- a/src/cuda/Kernel.h +++ b/src/gpu/Kernel.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file Kernel.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -25,20 +25,20 @@ #include "core/debug/Debug.h" #include "core/FunctionTraits.h" +#include "gpu/GPUWrapper.h" #include "ErrorChecking.h" -#include <cuda_runtime.h> #include <type_traits> #include <vector> - namespace walberla { -namespace cuda { +namespace gpu +{ //******************************************************************************************************************* - /*! Wrapper class around a CUDA kernel, to call kernels also from code not compiled with nvcc + /*! Wrapper class around a GPU kernel, to call kernels also from code not compiled with the device compiler * * Example: * \code @@ -55,10 +55,8 @@ namespace cuda { * \endcode * * Why use this strange wrapper class instead of the nice kernel call syntax "<<<griddim, blockdim >>>" ?? 
- * - This syntax is nice but has to be compiled with nvcc, which does not (yet) understand C++11 - * - C++11 features are used all over the place in waLBerla code - * - all *.cu files and headers included in *.cu files have to be "C++11 free" - * - thus there should be as few code as possible in *.cu files + * - This syntax is nice but has to be compiled with the device compiler + * - The wrapper allows to compile the kernel call with the host compiler * * Drawbacks of this class compared to kernel call syntax: * Type checking of parameters can only be done at runtime (is done only in Debug mode!). @@ -75,17 +73,17 @@ namespace cuda { // this code is equivalent to: kernel_func<<< dim3( 3,3,3), dim3( 4,4,4) >> ( argument1, 20 ); * \endcode - * The parameter types of the kernel and the parameters added at the cuda::Kernel class do not match. + * The parameter types of the kernel and the parameters added at the gpu::Kernel class do not match. * This is only detected when the code is run and was compiled in DEBUG mode! 
* * * Advantages of this class compared to kernel call syntax: Integrates nicely with waLBerlas field indexing and * accessor concepts: * \code - void kernel_func( cuda::SimpleFieldAccessor<double> f ); + void kernel_func( gpu::SimpleFieldAccessor<double> f ); - auto myKernel = cuda::make_kernel( &kernel_double ); - myKernel.addFieldIndexingParam( cuda::SimpleFieldIndexing<double>::xyz( gpuField ) ); + auto myKernel = gpu::make_kernel( &kernel_double ); + myKernel.addFieldIndexingParam( gpu::SimpleFieldIndexing<double>::xyz( gpuField ) ); myKernel(); * \endcode * When using at least one FieldIndexingParameter configure() does not have to be called, since the thread and grid @@ -104,7 +102,7 @@ namespace cuda { void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 ); - void operator() ( cudaStream_t stream = 0 ) const; + void operator() ( gpuStream_t stream = nullptr ) const; protected: @@ -113,10 +111,10 @@ namespace cuda { //@{ FuncPtr funcPtr_; - bool configured_; + bool configured_{ false }; dim3 gridDim_; dim3 blockDim_; - std::size_t sharedMemSize_; + std::size_t sharedMemSize_{ 0 }; std::vector< std::vector<char> > params_; //@} @@ -175,10 +173,7 @@ namespace cuda { template<typename FP> Kernel<FP>::Kernel( FP funcPtr ) - : funcPtr_ ( funcPtr ), - configured_( false ), - sharedMemSize_( 0 ) - {} + : funcPtr_ ( funcPtr ) {} template<typename FP> template<typename T> @@ -189,7 +184,7 @@ namespace cuda { std::memcpy ( paramInfo.data(), ¶m, sizeof(T) ); WALBERLA_ASSERT( checkParameter<T>( params_.size() ), - "cuda::Kernel type mismatch of parameter " << params_.size() ); + "gpu::Kernel type mismatch of parameter " << params_.size() ) params_.push_back( paramInfo ); } @@ -218,18 +213,18 @@ namespace cuda { if ( gridDim.x != gridDim_.x || gridDim.y != gridDim_.y || gridDim.z != gridDim_.z || blockDim.x != blockDim_.x || blockDim.y != blockDim_.y || blockDim.z != blockDim_.z ) { - WALBERLA_ABORT( "Error when configuring cuda::Kernel: 
Inconsistent setup. " ); + WALBERLA_ABORT( "Error when configuring gpu::Kernel: Inconsistent setup. " ) } } } template<typename FP> - void Kernel<FP>::operator() ( cudaStream_t stream ) const + void Kernel<FP>::operator() ( gpuStream_t stream ) const { // check for correct number of parameter calls if ( params_.size() != FunctionTraits<FuncType>::arity ) { - WALBERLA_ABORT( "Error when calling cuda::Kernel - Wrong number of arguments. " << - "Expected " << FunctionTraits<FuncType>::arity << ", received " << params_.size() ); + WALBERLA_ABORT( "Error when calling gpu::Kernel - Wrong number of arguments. " << + "Expected " << FunctionTraits<FuncType>::arity << ", received " << params_.size() ) } // register all parameters @@ -241,7 +236,10 @@ namespace cuda { // .. and launch the kernel static_assert( sizeof(void *) == sizeof(void (*)(void)), "object pointer and function pointer sizes must be equal" ); - WALBERLA_CUDA_CHECK( cudaLaunchKernel( (void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream ) ); + WALBERLA_DEVICE_SECTION() + { + WALBERLA_GPU_CHECK(gpuLaunchKernel((void*) funcPtr_, gridDim_, blockDim_, args.data(), sharedMemSize_, stream)) + } } @@ -259,7 +257,7 @@ namespace cuda { case 6: return checkParameter6<T>(); case 7: return checkParameter7<T>(); default: - WALBERLA_ABORT("Too many parameters passed to kernel"); + WALBERLA_ABORT("Too many parameters passed to kernel") } return false; } @@ -267,5 +265,5 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/gpu/NVTX.h b/src/gpu/NVTX.h new file mode 100644 index 0000000000000000000000000000000000000000..86e3e6a3a4d1d21fd309eeaa2b32d9dc286a3442 --- /dev/null +++ b/src/gpu/NVTX.h @@ -0,0 +1,99 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NVTX.h +//! \ingroup gpu +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "core/DataTypes.h" + +#include "DeviceWrapper.h" + +#if defined(WALBERLA_BUILD_WITH_CUDA) + #include <nvToolsExt.h> + #include <nvToolsExtCuda.h> + #include <nvToolsExtCudaRt.h> + #include <string> +#endif + +namespace walberla{ +namespace gpu +{ + +inline void nvtxMarker(const std::string& name, const uint32_t color=0xaaaaaa) +{ +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxEventAttributes_t eventAttrib; + memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = 0xFF000000 | color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + nvtxMarkEx(&eventAttrib); +#else + WALBERLA_UNUSED(name); + WALBERLA_UNUSED(color); +#endif +} + +inline void nameStream(const cudaStream_t & stream, const std::string & name) +{ +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxNameCudaStreamA(stream, name.c_str()); +#else + WALBERLA_UNUSED(stream); + WALBERLA_UNUSED(name); +#endif +} + +class NvtxRange +{ +public: + 
NvtxRange(const std::string & name, const uint32_t color=0xaaaaaa) + { +#if defined(WALBERLA_BUILD_WITH_CUDA) + memset(&eventAttrib, 0, NVTX_EVENT_ATTRIB_STRUCT_SIZE); + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = 0xFF000000 | color; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + nvtxRangePushEx(&eventAttrib); +#else + WALBERLA_UNUSED(name); + WALBERLA_UNUSED(color); +#endif + } + + ~NvtxRange() + { +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxRangePop(); +#endif + } +private: +#if defined(WALBERLA_BUILD_WITH_CUDA) + nvtxEventAttributes_t eventAttrib; +#endif +}; + + +} // namespace gpu +} // namespace walberla \ No newline at end of file diff --git a/src/cuda/ParallelStreams.cpp b/src/gpu/ParallelStreams.cpp similarity index 54% rename from src/cuda/ParallelStreams.cpp rename to src/gpu/ParallelStreams.cpp index d2fff04161673bbb720d359d89efd63a31d031b1..aed66f6932b48fcad2b2dcb945d3868382266c6a 100644 --- a/src/cuda/ParallelStreams.cpp +++ b/src/gpu/ParallelStreams.cpp @@ -14,47 +14,57 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file ParallelStreams.cpp -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== -#include "cuda/ParallelStreams.h" +#include "gpu/ParallelStreams.h" +#include "gpu/DeviceWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ - ParallelSection::ParallelSection(ParallelStreams * parent, cudaStream_t mainStream) + ParallelSection::ParallelSection(ParallelStreams * parent, gpuStream_t mainStream) : parent_( parent ), mainStream_( mainStream ), counter_( 0 ) { - WALBERLA_CUDA_CHECK( cudaEventCreate(&startEvent_) ); - WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) ); + WALBERLA_DEVICE_SECTION() + { + WALBERLA_GPU_CHECK(gpuEventCreate(&startEvent_)) + WALBERLA_GPU_CHECK(gpuEventRecord(startEvent_, mainStream_)) + } } ParallelSection::~ParallelSection() { - synchronize(); - WALBERLA_CUDA_CHECK( cudaEventDestroy(startEvent_) ); + WALBERLA_DEVICE_SECTION() + { + synchronize(); + WALBERLA_GPU_CHECK( gpuEventDestroy(startEvent_) ) + } } void ParallelSection::next() { - if( counter_ > 0 ) { - WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) ); - } - else { - WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->mainEvent_, mainStream_ ) ); - } - ++counter_; + WALBERLA_DEVICE_SECTION() + { + if (counter_ > 0) + { + WALBERLA_GPU_CHECK(gpuEventRecord(parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1])) + } + else { WALBERLA_GPU_CHECK(gpuEventRecord(parent_->mainEvent_, mainStream_)) } + ++counter_; - parent_->ensureSize( counter_ ); + parent_->ensureSize(counter_); - WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream(), startEvent_, 0 )); + WALBERLA_GPU_CHECK(gpuStreamWaitEvent(stream(), startEvent_, 0)) + } } - void ParallelSection::run(const std::function<void( cudaStream_t)> & f) + void ParallelSection::run(const std::function<void(gpuStream_t)> & f) { f( stream() ); next(); @@ -62,21 +72,23 @@ 
namespace cuda { void ParallelSection::synchronize() { - for( uint_t i=0; i < counter_; ++i ) - for( uint_t j=0; j < counter_; ++j ) - { - if( i == j ) - continue; + WALBERLA_DEVICE_SECTION() + { + for (uint_t i = 0; i < counter_; ++i) + for (uint_t j = 0; j < counter_; ++j) + { + if (i == j) continue; - auto & event = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1]; - cudaStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; - WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream, event, 0 )); - } + auto& event = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1]; + gpuStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; + WALBERLA_GPU_CHECK(gpuStreamWaitEvent(stream, event, 0)) + } - WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) ); + WALBERLA_GPU_CHECK(gpuEventRecord(startEvent_, mainStream_)) + } } - cudaStream_t ParallelSection::stream() + gpuStream_t ParallelSection::stream() { return counter_ == 0 ? mainStream_ : parent_->sideStreams_[counter_ - 1]; } @@ -88,7 +100,7 @@ namespace cuda { { } - ParallelSection ParallelStreams::parallelSection( cudaStream_t stream ) { + ParallelSection ParallelStreams::parallelSection( gpuStream_t stream ) { return ParallelSection(this, stream); } @@ -96,7 +108,7 @@ namespace cuda { for( uint_t i = sideStreams_.size(); i < size; ++i ) { sideStreams_.emplace_back( StreamRAII::newPriorityStream(streamPriority_)); - events_.emplace_back( EventRAII() ); + events_.emplace_back( ); } } @@ -109,5 +121,5 @@ namespace cuda { -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/ParallelStreams.h b/src/gpu/ParallelStreams.h similarity index 77% rename from src/cuda/ParallelStreams.h rename to src/gpu/ParallelStreams.h index 4116e0ef971ccc4d08209e9f0a20fc2ced3878c9..0eca060569adf0e404c510f7847d9348f535df0f 100644 --- a/src/cuda/ParallelStreams.h +++ b/src/gpu/ParallelStreams.h @@ -14,18 +14,19 @@ // with waLBerla (see 
COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file ParallelStreams.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== #pragma once -#include "cuda/ErrorChecking.h" -#include "cuda/CudaRAII.h" - #include <vector> +#include "gpu/ErrorChecking.h" +#include "gpu/GPURAII.h" + namespace walberla { -namespace cuda { +namespace gpu +{ class ParallelStreams; @@ -33,32 +34,32 @@ namespace cuda { { public: ~ParallelSection(); - void run( const std::function<void( cudaStream_t )> &f ); + void run( const std::function<void( gpuStream_t )> &f ); - cudaStream_t stream(); + gpuStream_t stream(); void next(); private: friend class ParallelStreams; - ParallelSection( ParallelStreams *parent, cudaStream_t mainStream ); + ParallelSection( ParallelStreams *parent, gpuStream_t mainStream ); void synchronize(); ParallelStreams * parent_; - cudaStream_t mainStream_; - cudaEvent_t startEvent_; + gpuStream_t mainStream_; + gpuEvent_t startEvent_; uint_t counter_; }; //******************************************************************************************************************* /*! - * Helper class to run CUDA operations on parallel streams + * Helper class to run CUDA/HIP operations on parallel streams * * This class introduces "side streams" that overlap with one "main stream". In a parallel section, multiple - * kernels (or other CUDA operations) are scheduled to the streams. The first "run" is scheduled on the main stream + * kernels (or other CUDA/HIP operations) are scheduled to the streams. The first "run" is scheduled on the main stream * all subsequent operations on the side streams. The passed priority affects only the side streams. When - * the parallel section goes out of scope the side streams are synchronized to the main stream via CUDA events. 
+ * the parallel section goes out of scope the side streams are synchronized to the main stream via CUDA/HIP events. * * Example: * @@ -66,8 +67,8 @@ namespace cuda { * ParallelStreams streams; * { * // new scope for the parallel section - * ParallelSection sec = streams.parallelSection( mainCudaStream ); - * sec.run([&] ( cudaStream_t sideStream ) { + * ParallelSection sec = streams.parallelSection( mainGPUStream ); + * sec.run([&] ( gpuStream_t sideStream ) { * // run something on the side stream * }); * // after the parallel section goes out of scope the side streams are synchronized to the main stream @@ -81,7 +82,7 @@ namespace cuda { { public: ParallelStreams( int priority = 0 ); - ParallelSection parallelSection( cudaStream_t stream ); + ParallelSection parallelSection( gpuStream_t stream ); void setStreamPriority( int priority ); private: @@ -96,5 +97,5 @@ namespace cuda { }; -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/cuda/communication/CMakeLists.txt b/src/gpu/communication/CMakeLists.txt similarity index 63% rename from src/cuda/communication/CMakeLists.txt rename to src/gpu/communication/CMakeLists.txt index b1fe9c3492eb1e60040469cb1ada41559c1121dd..7b9c0cced315353be228779dc3e4dfc96764efc7 100644 --- a/src/cuda/communication/CMakeLists.txt +++ b/src/gpu/communication/CMakeLists.txt @@ -1,4 +1,4 @@ -target_sources( cuda +target_sources( gpu PRIVATE MemcpyPackInfo.h UniformGPUScheme.impl.h @@ -7,5 +7,7 @@ target_sources( cuda GPUPackInfo.h CustomMemoryBuffer.h UniformGPUScheme.h - GeneratedGPUPackInfo.h + NonUniformGPUScheme.h + GeneratedGPUPackInfo.h + GeneratedNonUniformGPUPackInfo.h ) diff --git a/src/cuda/communication/CustomMemoryBuffer.h b/src/gpu/communication/CustomMemoryBuffer.h similarity index 90% rename from src/cuda/communication/CustomMemoryBuffer.h rename to src/gpu/communication/CustomMemoryBuffer.h index 
2caab2a41b13f5fb88c6e3052312e742778db408..e01e873708d84788fcecfb33a83ab3616b07c752 100644 --- a/src/cuda/communication/CustomMemoryBuffer.h +++ b/src/gpu/communication/CustomMemoryBuffer.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file CustomMemoryBuffer.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> //! \brief Basic Buffer supporting different memory spaces // @@ -22,14 +22,15 @@ #pragma once -#include "cuda/ErrorChecking.h" - #include <algorithm> #include <cstring> +#include "gpu/ErrorChecking.h" +#include "gpu/GPUWrapper.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -40,7 +41,7 @@ namespace communication { /*! * Simple buffer class that supports memory allocators, e.g. for pinned host memory or GPU memory * - * \ingroup cuda + * \ingroup gpu * * In contrast to core::mpi::Buffer this class does not support stream operators "<<" and ">>" because these * operators imply serial (un)packing which is not feasible on the GPU. 
@@ -61,7 +62,7 @@ namespace communication { class CustomMemoryBuffer { public: - typedef uint8_t ElementType; + using ElementType = uint8_t; explicit CustomMemoryBuffer(); explicit CustomMemoryBuffer( std::size_t initSize ); @@ -73,6 +74,7 @@ namespace communication { inline std::size_t allocSize() const { return std::size_t(end_ - begin_); } inline std::size_t size() const { return std::size_t(cur_ - begin_); } ElementType *ptr() const { return begin_; } + ElementType *cur() const { return cur_; } inline void clear() { cur_ = begin_; } @@ -100,13 +102,13 @@ namespace communication { static void *allocate( size_t size ) { void *p; - WALBERLA_CUDA_CHECK( cudaMallocHost( &p, size )) + WALBERLA_GPU_CHECK( gpuMallocHost( &p, size )) return p; } static void deallocate( void *ptr ) { - WALBERLA_CUDA_CHECK( cudaFreeHost( ptr )) + WALBERLA_GPU_CHECK( gpuFreeHost( ptr )) } static void memcpy( void *dst, void *src, size_t count ) @@ -120,24 +122,24 @@ namespace communication { static void *allocate( size_t size ) { void *p; - WALBERLA_CUDA_CHECK( cudaMalloc( &p, size )) + WALBERLA_GPU_CHECK( gpuMalloc( &p, size )) return p; } static void deallocate( void *ptr ) { - WALBERLA_CUDA_CHECK( cudaFree( ptr )) + WALBERLA_GPU_CHECK( gpuFree( ptr )) } static void memcpy( void *dst, void *src, size_t count ) { - cudaMemcpy( dst, src, count, cudaMemcpyDeviceToDevice ); + WALBERLA_GPU_CHECK( gpuMemcpy( dst, src, count, gpuMemcpyDeviceToDevice ) ) } }; } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla #include "CustomMemoryBuffer.impl.h" diff --git a/src/cuda/communication/CustomMemoryBuffer.impl.h b/src/gpu/communication/CustomMemoryBuffer.impl.h similarity index 98% rename from src/cuda/communication/CustomMemoryBuffer.impl.h rename to src/gpu/communication/CustomMemoryBuffer.impl.h index 21d70e4ccceac05de50c8ceea67e09b780e9fa38..ea354be200fe307b50b9396e889b74c2b17b8819 100644 --- a/src/cuda/communication/CustomMemoryBuffer.impl.h +++ 
b/src/gpu/communication/CustomMemoryBuffer.impl.h @@ -14,14 +14,15 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file CustomMemoryBuffer.impl.h -//! \ingroup cuda +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -118,5 +119,5 @@ namespace communication { } } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/communication/GPUPackInfo.h b/src/gpu/communication/GPUPackInfo.h similarity index 89% rename from src/cuda/communication/GPUPackInfo.h rename to src/gpu/communication/GPUPackInfo.h index 661029b40dc1f39e55a20c7a708d19bc89da4cfa..c34600f29b2219088c29b0d5ff2e9fb1dc4a1142 100644 --- a/src/cuda/communication/GPUPackInfo.h +++ b/src/gpu/communication/GPUPackInfo.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUPackInfo.h -//! \ingroup cuda +//! \ingroup gpu //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! 
\author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //====================================================================================================================== @@ -22,32 +22,33 @@ #pragma once #include "blockforest/Block.h" + #include "communication/UniformPackInfo.h" + #include "core/debug/Debug.h" #include "core/math/Vector3.h" #include "core/mpi/BufferSizeTrait.h" + #include "field/GhostRegions.h" #include "field/Layout.h" -#include "stencil/Directions.h" -#include "cuda/ErrorChecking.h" -#include "cuda/GPUCopy.h" -#include "cuda/communication/CustomMemoryBuffer.h" +#include "stencil/Directions.h" -#include <cuda_runtime.h> #include <map> -#include <vector> #include <tuple> +#include <vector> +#include "gpu/ErrorChecking.h" +#include "gpu/GPUCopy.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/CustomMemoryBuffer.h" -namespace walberla { -namespace cuda { -namespace communication { +namespace walberla::gpu::communication { /** - * Data packing/unpacking for ghost layer based communication of a cuda::GPUField - * \ingroup cuda + * Data packing/unpacking for ghost layer based communication of a gpu::GPUField + * \ingroup gpu * Template Parameters: * - GPUField_T A fully qualified GPUField. 
*/ @@ -78,7 +79,7 @@ public: void communicateLocal(const IBlock * sender, IBlock * receiver, stencil::Direction dir); - void setCommunicationStream( cudaStream_t stream ) + void setCommunicationStream( gpuStream_t stream ) { if ( stream != 0 ) { @@ -96,7 +97,7 @@ protected: bool communicateAllGhostLayers_; uint_t numberOfGhostLayers_; bool copyAsync_; - cudaStream_t communicationStream_; + gpuStream_t communicationStream_; std::map< stencil::Direction, PinnedMemoryBuffer > pinnedRecvBuffers_; mutable std::map< stencil::Direction, PinnedMemoryBuffer > pinnedSendBuffers_; }; @@ -106,7 +107,7 @@ template<typename GPUField_T> void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer) { GPUField_T * fieldPtr = receiver->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr); + WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr) cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); @@ -126,7 +127,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr ); } - cudaStream_t & unpackStream = communicationStream_; + gpuStream_t & unpackStream = communicationStream_; auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers), uint_c(fieldCi.yMin() + nrOfGhostLayers), @@ -156,7 +157,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d if ( copyAsync_ ) { - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( unpackStream ) ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( unpackStream ) ); } } @@ -167,13 +168,13 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r const GPUField_T * sf = sender ->getData< GPUField_T >( bdId_ ); GPUField_T * rf = receiver->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( sf ); - WALBERLA_ASSERT_NOT_NULLPTR( rf ); + WALBERLA_ASSERT_NOT_NULLPTR( sf ) + 
WALBERLA_ASSERT_NOT_NULLPTR( rf ) - WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize()); - WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize()); - WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize()); - WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize()); + WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize()) + WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize()) + WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize()) + WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize()) WALBERLA_CHECK( sf->layout() == rf->layout(), "GPUPackInfo::communicateLocal: fields must have the same layout!" ); @@ -182,7 +183,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false ); CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false ); - cudaStream_t & commStream = communicationStream_; + gpuStream_t & commStream = communicationStream_; auto dstOffset = std::make_tuple( uint_c(rCi.xMin() + nrOfGhostLayers), uint_c(rCi.yMin() + nrOfGhostLayers), @@ -217,7 +218,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r if ( copyAsync_ ) { - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( commStream ) ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( commStream ) ) } } @@ -226,7 +227,7 @@ template<typename GPUField_T> void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const { const GPUField_T * fieldPtr = sender->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr); + WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr) cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); @@ -236,7 +237,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc unsigned char * outBufferPtr = outBuffer.forward( nrOfBytesToPack ); - const cudaStream_t & packStream = communicationStream_; + const gpuStream_t & packStream = 
communicationStream_; unsigned char * copyBufferPtr = outBufferPtr; if ( copyAsync_ ) @@ -274,7 +275,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc if ( copyAsync_ ) { - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( packStream ) ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( packStream ) ) std::copy( copyBufferPtr, static_cast<unsigned char *>( copyBufferPtr + nrOfBytesToPack ), outBufferPtr ); } @@ -290,13 +291,9 @@ uint_t GPUPackInfo<GPUField_T>::numberOfGhostLayersToCommunicate( const GPUField } else { - WALBERLA_ASSERT_LESS_EQUAL( numberOfGhostLayers_, field->nrOfGhostLayers() ); + WALBERLA_ASSERT_LESS_EQUAL( numberOfGhostLayers_, field->nrOfGhostLayers() ) return numberOfGhostLayers_; } } - - -} // namespace communication -} // namespace cuda -} // namespace walberla +} // namespace walberla::gpu::communication diff --git a/src/cuda/communication/GeneratedGPUPackInfo.h b/src/gpu/communication/GeneratedGPUPackInfo.h similarity index 77% rename from src/cuda/communication/GeneratedGPUPackInfo.h rename to src/gpu/communication/GeneratedGPUPackInfo.h index 752f2907c734cbb1a18d73b390424d33c94f3aa8..f5f6c98b60b529045a1877a435fcacacb9359a95 100644 --- a/src/cuda/communication/GeneratedGPUPackInfo.h +++ b/src/gpu/communication/GeneratedGPUPackInfo.h @@ -14,31 +14,30 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GeneratedGPUPackInfo.h -//! \ingroup core +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== - #pragma once -#include "stencil/Directions.h" #include "domain_decomposition/IBlock.h" -#include <cuda_runtime.h> +#include "gpu/GPUWrapper.h" -namespace walberla { -namespace cuda { +#include "stencil/Directions.h" +namespace walberla::gpu { class GeneratedGPUPackInfo { public: - virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0; - virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0; + GeneratedGPUPackInfo() = default; + virtual ~GeneratedGPUPackInfo() = default; + + virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; + virtual void communicateLocal ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) = 0; + virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, gpuStream_t stream ) = 0; virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0; }; - - -} //namespace cuda -} //namespace walberla \ No newline at end of file +} //namespace walberla::gpu \ No newline at end of file diff --git a/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..f6b39d9b0fe0dd1b9c90c5d63eb7b8ca00bd3d0f --- /dev/null +++ b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h @@ -0,0 +1,159 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GeneratedNonUniformGPUPackInfo.h +//! \ingroup gpu +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/Block.h" +#include "blockforest/BlockID.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/CustomMemoryBuffer.h" + +#include "stencil/Directions.h" + +using GpuBuffer_T = walberla::gpu::communication::GPUMemoryBuffer; + + +namespace walberla::gpu { + + +class GeneratedNonUniformGPUPackInfo +{ + public: + using VoidFunction = std::function< void( gpuStream_t) >; + GeneratedNonUniformGPUPackInfo() = default; + virtual ~GeneratedNonUniformGPUPackInfo() = default; + + virtual bool constantDataExchange() const = 0; + virtual bool threadsafeReceiving() const = 0; + + inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer) const; + virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir, gpuStream_t stream) = 0; + virtual void getLocalEqualLevelCommFunction( std::vector< VoidFunction >& commFunctions, const Block * sender, Block * receiver, 
stencil::Direction dir) = 0; + + inline void packDataCoarseToFine ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const; + virtual void unpackDataCoarseToFine ( Block * fineReceiver, const BlockID & coarseSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir ) = 0; + virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; + virtual void getLocalCoarseToFineCommFunction( std::vector< VoidFunction >& commFunctions, const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + + inline void packDataFineToCoarse ( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const; + virtual void unpackDataFineToCoarse ( Block * coarseReceiver, const BlockID & fineSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir) = 0; + virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; + virtual void getLocalFineToCoarseCommFunction( std::vector< VoidFunction >& commFunctions, const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + + virtual uint_t sizeEqualLevelSend( const Block * sender, stencil::Direction dir) = 0; + virtual uint_t sizeCoarseToFineSend ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir) = 0; + virtual uint_t sizeFineToCoarseSend ( const Block * fineSender, stencil::Direction dir) = 0; + + +#ifndef NDEBUG + void clearBufferSizeCheckMap() { bufferSize_.clear(); } +#endif + + protected: + 
virtual void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + virtual void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + virtual void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + +#ifndef NDEBUG + mutable std::map< const Block *, std::map< stencil::Direction, std::map< uint_t, size_t > > > bufferSize_; +#endif + +}; + +inline void GeneratedNonUniformGPUPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer ) const +{ +#ifndef NDEBUG + size_t const sizeBefore = buffer.size(); +#endif + + packDataEqualLevelImpl( sender, dir, buffer ); + +#ifndef NDEBUG +size_t const sizeAfter = buffer.size(); +if( constantDataExchange() ) +{ + auto & blockMap = bufferSize_[ sender ]; + auto & sizeMap = blockMap[ dir ]; + auto dirEntry = sizeMap.find( uint_t(0) ); + if( dirEntry == sizeMap.end() ) + sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; + else + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) +} +#endif +} + + + +inline void GeneratedNonUniformGPUPackInfo::packDataCoarseToFine( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const +{ +#ifndef NDEBUG + size_t const sizeBefore = buffer.size(); +#endif + + packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer ); + +#ifndef NDEBUG +size_t const sizeAfter = buffer.size(); +if( constantDataExchange() ) +{ + auto & blockMap = bufferSize_[ coarseSender ]; + auto & sizeMap = blockMap[ dir ]; + auto dirEntry = sizeMap.find( fineReceiver.getBranchId() ); + if( dirEntry == sizeMap.end() ) + sizeMap[ fineReceiver.getBranchId() ] = sizeAfter - sizeBefore; + else + WALBERLA_ASSERT_EQUAL( sizeMap[ fineReceiver.getBranchId() ], (sizeAfter - sizeBefore) ) +} +#endif +} + + 
+ +inline void GeneratedNonUniformGPUPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const +{ +#ifndef NDEBUG + size_t const sizeBefore = buffer.size(); +#endif + + packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer ); + +#ifndef NDEBUG +size_t const sizeAfter = buffer.size(); +if( constantDataExchange() ) +{ + auto & blockMap = bufferSize_[ fineSender ]; + auto & sizeMap = blockMap[ dir ]; + auto dirEntry = sizeMap.find( uint_t(0) ); + if( dirEntry == sizeMap.end() ) + sizeMap[ uint_t(0) ] = sizeAfter - sizeBefore; + else + WALBERLA_ASSERT_EQUAL( sizeMap[ uint_t(0) ], (sizeAfter - sizeBefore) ) +} +#endif +} + + +} //namespace walberla::gpu \ No newline at end of file diff --git a/src/gpu/communication/MemcpyPackInfo.h b/src/gpu/communication/MemcpyPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..6c15988f4f2687275fea7f0f8be36b2e7d99fcf6 --- /dev/null +++ b/src/gpu/communication/MemcpyPackInfo.h @@ -0,0 +1,38 @@ +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + +#include "domain_decomposition/IBlock.h" + +#include "stencil/Directions.h" + +#include "gpu/GPUField.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + +namespace walberla::gpu::communication { + +template<typename GPUFieldType> +class MemcpyPackInfo : public ::walberla::gpu::GeneratedGPUPackInfo +{ +public: + MemcpyPackInfo( BlockDataID pdfsID_ ) : pdfsID(pdfsID_) {}; + ~MemcpyPackInfo() override = default; + + void pack (stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + void communicateLocal ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) override; + void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, gpuStream_t stream) override; + uint_t size(stencil::Direction dir, IBlock * block) override; + 
+private: + BlockDataID pdfsID; + uint_t numberOfGhostLayers_{0}; + bool communicateAllGhostLayers_{true}; + + uint_t numberOfGhostLayersToCommunicate( const GPUFieldType * const field ) const; +}; + +} // namespace walberla::gpu::communication + +#include "MemcpyPackInfo.impl.h" diff --git a/src/cuda/communication/MemcpyPackInfo.impl.h b/src/gpu/communication/MemcpyPackInfo.impl.h similarity index 66% rename from src/cuda/communication/MemcpyPackInfo.impl.h rename to src/gpu/communication/MemcpyPackInfo.impl.h index b75587c5bcdcef1ce38e06f58db53339095ce7f8..2110933cda5322828f40cc14b471be5c6a309bfe 100644 --- a/src/cuda/communication/MemcpyPackInfo.impl.h +++ b/src/gpu/communication/MemcpyPackInfo.impl.h @@ -3,27 +3,27 @@ #include "field/Layout.h" #include "stencil/Directions.h" #include "core/cell/CellInterval.h" -#include "cuda/GPUField.h" -#include "cuda/GPUCopy.h" +#include "gpu/GPUField.h" +#include "gpu/GPUCopy.h" #include "core/DataTypes.h" #include "MemcpyPackInfo.h" -#include <cuda_runtime.h> namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { template<typename GPUFieldType> void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char * byte_buffer, - IBlock * block, cudaStream_t stream) + IBlock * block, gpuStream_t stream) { // Extract field data pointer from the block const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID ); WALBERLA_ASSERT_NOT_NULLPTR( fieldPtr ) // cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); - CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false ); + CellInterval fieldCi = field::getSliceBeforeGhostLayer( *fieldPtr, dir, nrOfGhostLayers, false ); // Base offsets into the buffer and GPUField, respectively auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) ); @@ -41,7 +41,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char const uint_t 
dstAllocSizeZ = fieldCi.zSize(); const uint_t srcAllocSizeZ = fieldPtr->zAllocSize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.ySize() ); @@ -55,7 +55,7 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char const uint_t dstAllocSizeZ = fieldCi.ySize(); const uint_t srcAllocSizeZ = fieldPtr->yAllocSize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() ); @@ -65,9 +65,68 @@ void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char } } +template<typename GPUFieldType> +void MemcpyPackInfo< GPUFieldType >::communicateLocal( stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream ) +{ + // WALBERLA_ABORT("The MemcpyPackInfo does not provide a thread safe local communication. Thus is can not be used in local mode. 
To use it set local useLocalCommunication to false in the communication scheme") + + + // Extract field data pointer from the block + const GPUFieldType * senderFieldPtr = sender->getData< GPUFieldType >( pdfsID ); + const GPUFieldType * receiverFieldPtr = receiver->getData< GPUFieldType >( pdfsID ); + WALBERLA_ASSERT_NOT_NULLPTR( senderFieldPtr ) + WALBERLA_ASSERT_NOT_NULLPTR( receiverFieldPtr ) + + // + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( senderFieldPtr ) ); + WALBERLA_ASSERT_EQUAL(nrOfGhostLayers, cell_idx_c( numberOfGhostLayersToCommunicate( receiverFieldPtr ))) + WALBERLA_ASSERT_EQUAL(senderFieldPtr->layout(), receiverFieldPtr->layout() ) + WALBERLA_ASSERT_EQUAL(senderFieldPtr->fSize(), receiverFieldPtr->fSize() ) + + CellInterval senderCi = field::getSliceBeforeGhostLayer( *senderFieldPtr, dir, nrOfGhostLayers, false ); + CellInterval receiverCi = field::getGhostRegion( *receiverFieldPtr, stencil::inverseDir[dir], nrOfGhostLayers, false ); + + // Base offsets into the buffer and GPUField, respectively + auto srcOffset = std::make_tuple( uint_c(senderCi.xMin() + nrOfGhostLayers), + uint_c(senderCi.yMin() + nrOfGhostLayers), + uint_c(senderCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + + auto dstOffset = std::make_tuple( uint_c(receiverCi.xMin() + nrOfGhostLayers), + uint_c(receiverCi.yMin() + nrOfGhostLayers), + uint_c(receiverCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + + + // Size of data to pack, in terms of elements of the field + auto intervalSize = std::make_tuple( senderCi.xSize(), senderCi.ySize(), + senderCi.zSize(), senderFieldPtr->fSize() ); + + WALBERLA_ASSERT_EQUAL(intervalSize, std::make_tuple( receiverCi.xSize(), receiverCi.ySize(), receiverCi.zSize(), receiverFieldPtr->fSize() )) + + if ( senderFieldPtr->layout() == field::fzyx ) + { + const uint_t dstAllocSizeZ = receiverFieldPtr->zAllocSize(); + const uint_t srcAllocSizeZ = senderFieldPtr->zAllocSize(); + + copyDevToDevFZYX( 
receiverFieldPtr->pitchedPtr(), senderFieldPtr->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type), + intervalSize, stream ); + } + else + { + const uint_t dstAllocSizeZ = receiverFieldPtr->yAllocSize(); + const uint_t srcAllocSizeZ = senderFieldPtr->yAllocSize(); + + copyDevToDevZYXF( receiverFieldPtr->pitchedPtr(), senderFieldPtr->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type), + intervalSize, stream ); + } +} + template<typename GPUFieldType> void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer, - IBlock * block, cudaStream_t stream) + IBlock * block, gpuStream_t stream) { GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID ); WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr) @@ -75,7 +134,6 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false ); - auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers), uint_c(fieldCi.yMin() + nrOfGhostLayers), uint_c(fieldCi.zMin() + nrOfGhostLayers), @@ -89,7 +147,7 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha const uint_t dstAllocSizeZ = fieldPtr->zAllocSize(); const uint_t srcAllocSizeZ = fieldCi.zSize(); - cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() * sizeof(typename GPUFieldType::value_type), fieldCi.ySize() ); @@ -102,7 +160,7 @@ void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned cha { const uint_t dstAllocSizeY = fieldPtr->yAllocSize(); const uint_t srcAllocSizeY = fieldCi.ySize(); - cudaPitchedPtr 
byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer, + gpuPitchedPtr byteBufferPitchedPtr = make_gpuPitchedPtr( byte_buffer, fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type), fieldCi.xSize() ); @@ -208,7 +266,7 @@ uint_t MemcpyPackInfo< GPUFieldType >::size(stencil::Direction dir, IBlock * blo return ci.numCells() * elementsPerCell * sizeof(typename GPUFieldType::value_type); */ - uint_t totalCells = ci.xSize() * ci.ySize() * ci.zSize() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type); + uint_t totalCells = ci.numCells() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type); return totalCells; } @@ -227,5 +285,5 @@ uint_t MemcpyPackInfo< GPUFieldType >::numberOfGhostLayersToCommunicate( const G } } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla \ No newline at end of file diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h new file mode 100644 index 0000000000000000000000000000000000000000..4b9576434d52dc8726daa35ab91a32fd8984a780 --- /dev/null +++ b/src/gpu/communication/NonUniformGPUScheme.h @@ -0,0 +1,973 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGPUScheme.h +//! \ingroup gpu +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" + +#include "core/mpi/BufferSystem.h" +#include "core/mpi/MPIWrapper.h" + +#include "domain_decomposition/IBlock.h" + +#include "stencil/Directions.h" + +#include <thread> + +#include "gpu/ErrorChecking.h" +#include "gpu/GPURAII.h" +#include "gpu/GPUWrapper.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/CustomMemoryBuffer.h" +#include "gpu/communication/GeneratedNonUniformGPUPackInfo.h" + +namespace walberla::gpu::communication +{ + +template< typename Stencil > +class NonUniformGPUScheme +{ + public: + enum INDEX { EQUAL_LEVEL = 0, COARSE_TO_FINE = 1, FINE_TO_COARSE = 2 }; + + using CpuBuffer_T = walberla::gpu::communication::PinnedMemoryBuffer; + using GpuBuffer_T = walberla::gpu::communication::GPUMemoryBuffer; + + explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU = false, + const int tag = 5432); + + explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, const Set< SUID >& requiredBlockSelectors, + const Set< SUID >& incompatibleBlockSelectors, bool sendDirectlyFromGPU = false, + const int tag = 5432); + + ~NonUniformGPUScheme(); + + //** Pack Info Registration ***************************************************************************************** + /*! 
\name Pack Info Registration */ + //@{ + void addPackInfo(const shared_ptr< GeneratedNonUniformGPUPackInfo >& pi); + //@} + //******************************************************************************************************************* + + inline void communicateEqualLevel(const uint_t level); + inline void communicateCoarseToFine(const uint_t fineLevel); + inline void communicateFineToCoarse(const uint_t fineLevel); + + std::function<void()> communicateEqualLevelFunctor(const uint_t level) { + return [level, this](){ NonUniformGPUScheme::communicateEqualLevel(level);}; + } + std::function<void()> communicateCoarseToFineFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformGPUScheme::communicateCoarseToFine(fineLevel);}; + } + std::function<void()> communicateFineToCoarseFunctor(const uint_t fineLevel) { + return [fineLevel, this](){ NonUniformGPUScheme::communicateFineToCoarse(fineLevel);}; + } + + inline void startCommunicateEqualLevel(const uint_t level); + inline void startCommunicateCoarseToFine(const uint_t fineLevel); + inline void startCommunicateFineToCoarse(const uint_t fineLevel); + + inline void waitCommunicateEqualLevel(const uint_t level); + inline void waitCommunicateCoarseToFine(const uint_t fineLevel); + inline void waitCommunicateFineToCoarse(const uint_t fineLevel); + + private: + void setupCommunication(); + + void init(); + void refresh(); + + bool isAnyCommunicationInProgress() const; + + void startCommunicationEqualLevel(const uint_t index, std::set< uint_t >& participatingLevels); + void startCommunicationCoarseToFine(const uint_t index, const uint_t coarsestLevel); + void startCommunicationFineToCoarse(const uint_t index, const uint_t finestLevel); + + weak_ptr< StructuredBlockForest > blockForest_; + uint_t forestModificationStamp_; + + std::vector< std::vector< bool > > communicationInProgress_; + bool sendFromGPU_; + int baseTag_; + + std::vector< std::vector< mpi::GenericBufferSystem< CpuBuffer_T, CpuBuffer_T > 
> > bufferSystemCPU_; + std::vector< std::vector< mpi::GenericBufferSystem< GpuBuffer_T, GpuBuffer_T > > > bufferSystemGPU_; + std::vector< std::vector< GpuBuffer_T > > localBuffer_; + + std::vector< shared_ptr< GeneratedNonUniformGPUPackInfo > > packInfos_; + + ParallelStreams parallelSectionManager_; + + struct Header + { + BlockID receiverId; + BlockID senderId; + stencil::Direction dir; + }; + std::vector< std::vector< std::map< mpi::MPIRank, std::vector< Header > > > > headers_; + + Set< SUID > requiredBlockSelectors_; + Set< SUID > incompatibleBlockSelectors_; +}; + +template< typename Stencil > +NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU, + const int tag) + : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), parallelSectionManager_(-1), + requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet()) +{ + WALBERLA_MPI_SECTION() + { +#if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT) + WALBERLA_CHECK(!sendDirectlyFromGPU) +#endif + } + init(); +} + +template< typename Stencil > +NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, + const Set< SUID >& requiredBlockSelectors, + const Set< SUID >& incompatibleBlockSelectors, + bool sendDirectlyFromGPU, const int tag) + : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors), + incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), + parallelSectionManager_(-1) +{ + WALBERLA_MPI_SECTION() + { +#if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT) + WALBERLA_CHECK(!sendDirectlyFromGPU) +#endif + } + init(); +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::init() +{ + bufferSystemCPU_.resize(3); + bufferSystemGPU_.resize(3); + localBuffer_.resize(3); + headers_.resize(3); + + communicationInProgress_.resize(3); + + 
refresh(); +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::refresh() +{ + WALBERLA_ASSERT(!isAnyCommunicationInProgress()) + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + const uint_t levels = forest->getNumberOfLevels(); + + for (uint_t i = 0; i != 3; ++i) + { + bufferSystemCPU_[i].clear(); + bufferSystemGPU_[i].clear(); + localBuffer_[i].clear(); + headers_[i].clear(); + headers_[i].resize(size_t(levels + uint_t(1))); + + for (uint_t j = 0; j <= levels; ++j) + { + headers_[i][j].clear(); + bufferSystemCPU_[i].emplace_back( + mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); + bufferSystemGPU_[i].emplace_back( + mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); + localBuffer_[i].emplace_back(); + } + + communicationInProgress_[i].resize(size_t(levels + uint_t(1)), false); + } + +#ifndef NDEBUG + for (auto p = packInfos_.begin(); p != packInfos_.end(); ++p) + (*p)->clearBufferSizeCheckMap(); +#endif + + forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::communicateEqualLevel(const uint_t level) +{ + startCommunicateEqualLevel(level); + waitCommunicateEqualLevel(level); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::communicateCoarseToFine(const uint_t fineLevel) +{ + startCommunicateCoarseToFine(fineLevel); + waitCommunicateCoarseToFine(fineLevel); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::communicateFineToCoarse(const uint_t fineLevel) +{ + startCommunicateFineToCoarse(fineLevel); + waitCommunicateFineToCoarse(fineLevel); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::startCommunicateEqualLevel(const uint_t level) +{ + auto forest = blockForest_.lock(); + 
WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(level, forest->getNumberOfLevels()) + + if (forestModificationStamp_ != forest->getBlockForest().getModificationStamp()) refresh(); + + std::set< uint_t > participatingLevels; + participatingLevels.insert(level); + + startCommunicationEqualLevel(level, participatingLevels); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::startCommunicateCoarseToFine(const uint_t fineLevel) +{ + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + + if (forestModificationStamp_ != forest->getBlockForest().getModificationStamp()) refresh(); + + const uint_t coarsestLevel = fineLevel - uint_t(1); + + startCommunicationCoarseToFine(fineLevel, coarsestLevel); +} + +template< typename Stencil > +inline void NonUniformGPUScheme< Stencil >::startCommunicateFineToCoarse(const uint_t fineLevel) +{ + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + + if (forestModificationStamp_ != forest->getBlockForest().getModificationStamp()) refresh(); + + const uint_t finestLevel = fineLevel; + + startCommunicationFineToCoarse(fineLevel, finestLevel); +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t index, + std::set< uint_t >& participatingLevels) +{ + if (packInfos_.empty()) return; + + WALBERLA_ASSERT(!communicationInProgress_[EQUAL_LEVEL][index]) + communicationInProgress_[EQUAL_LEVEL][index] = true; + + auto forest = 
blockForest_.lock(); + + // Schedule Receives + if (sendFromGPU_) + bufferSystemGPU_[EQUAL_LEVEL][index].scheduleReceives(); + else + bufferSystemCPU_[EQUAL_LEVEL][index].scheduleReceives(); + + if (!sendFromGPU_) + for (auto it : headers_[EQUAL_LEVEL][index]) + bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear(); + + // Start filling send buffers + { + for (auto& iBlock : *forest) + { + auto senderBlock = dynamic_cast< Block* >(&iBlock); + + if (!selectable::isSetSelected(senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (participatingLevels.find(senderBlock->getLevel()) == participatingLevels.end()) + continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (!(senderBlock->neighborhoodSectionHasEquallySizedBlock(neighborIdx))) + continue; + WALBERLA_ASSERT_EQUAL(senderBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + if (!selectable::isSetSelected(senderBlock->getNeighborState(neighborIdx, uint_t(0)),requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); + for (auto& pi : packInfos_) + { + pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, nullptr); + } + } + else + { + auto nProcess = mpi::MPIRank(senderBlock->getNeighborProcess(neighborIdx, uint_t(0))); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(nProcess); + + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeEqualLevelSend(senderBlock, *dir)) + + pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer); + + if (!sendFromGPU_) + { + auto 
gpuDataPtr = gpuDataBuffer.cur(); + auto size = pi->sizeEqualLevelSend(senderBlock, *dir); + auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) + } + } + } + } + } + } + + // wait for packing to finish + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + + + if (sendFromGPU_) + bufferSystemGPU_[EQUAL_LEVEL][index].sendAll(); + else + bufferSystemCPU_[EQUAL_LEVEL][index].sendAll(); + + communicationInProgress_[EQUAL_LEVEL][index] = true; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t index, const uint_t coarsestLevel) +{ + if (packInfos_.empty()) return; + WALBERLA_ASSERT(!communicationInProgress_[COARSE_TO_FINE][index]) + communicationInProgress_[COARSE_TO_FINE][index] = true; + + auto forest = blockForest_.lock(); + + // Schedule Receives + if (sendFromGPU_) + bufferSystemGPU_[COARSE_TO_FINE][index].scheduleReceives(); + else + bufferSystemCPU_[COARSE_TO_FINE][index].scheduleReceives(); + + if (!sendFromGPU_) + for (auto it : headers_[COARSE_TO_FINE][index]) + bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); + + // Start filling send buffers + { + for (auto& iBlock : *forest) + { + auto coarseBlock = dynamic_cast< Block* >(&iBlock); + auto nLevel = coarseBlock->getLevel(); + + if (!selectable::isSetSelected(coarseBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (nLevel != coarsestLevel) continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (coarseBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + if (!(coarseBlock->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue; + + for (uint_t n = 0; n != 
coarseBlock->getNeighborhoodSectionSize(neighborIdx); ++n) + { + const BlockID& fineReceiverId = coarseBlock->getNeighborId(neighborIdx, n); + if (!selectable::isSetSelected(coarseBlock->getNeighborState(neighborIdx, n), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + + if( coarseBlock->neighborExistsLocally( neighborIdx, n ) ) + { + auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) ); + // for (auto& pi : packInfos_) + // { + // pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir); + // } + + GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index]; + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) + pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr); + } + } + else + { + auto nProcess = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n)); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess); + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) + + pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer); + + if (!sendFromGPU_) + { + auto gpuDataPtr = gpuDataBuffer.cur(); + auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir); + auto cpuDataPtr = + bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) + } + } + } + } + } + localBuffer_[COARSE_TO_FINE][index].clear(); + } + } + + // wait for packing to finish + WALBERLA_GPU_CHECK( 
gpuDeviceSynchronize() ) + + if (sendFromGPU_) + bufferSystemGPU_[COARSE_TO_FINE][index].sendAll(); + else + bufferSystemCPU_[COARSE_TO_FINE][index].sendAll(); + + communicationInProgress_[COARSE_TO_FINE][index] = true; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t index, const uint_t finestLevel) +{ + if (packInfos_.empty()) return; + + WALBERLA_ASSERT(!communicationInProgress_[FINE_TO_COARSE][index]) + + communicationInProgress_[FINE_TO_COARSE][index] = true; + + auto forest = blockForest_.lock(); + + // Schedule Receives + if (sendFromGPU_) + bufferSystemGPU_[FINE_TO_COARSE][index].scheduleReceives(); + else + bufferSystemCPU_[FINE_TO_COARSE][index].scheduleReceives(); + + if (!sendFromGPU_) + for (auto it : headers_[FINE_TO_COARSE][index]) + bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); + + // Start filling send buffers + { + for (auto& iBlock : *forest) + { + auto fineBlock = dynamic_cast< Block* >(&iBlock); + auto nLevel = fineBlock->getLevel(); + + if (!selectable::isSetSelected(fineBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (nLevel != finestLevel) continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (fineBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + if (!(fineBlock->neighborhoodSectionHasLargerBlock(neighborIdx))) continue; + WALBERLA_ASSERT_EQUAL(fineBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + + const BlockID& coarseReceiverId = fineBlock->getNeighborId(neighborIdx, uint_t(0)); + if (!selectable::isSetSelected(fineBlock->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + auto coarseReceiverBlock = dynamic_cast< Block * 
>( forest->getBlock( coarseReceiverId ) ); + // for (auto& pi : packInfos_) + // { + // pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir); + // } + + GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index]; + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) + pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr); + } + } + else + { + auto nProcess = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0))); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess); + + for (auto& pi : packInfos_) + { + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) + + pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer); + + if (!sendFromGPU_) + { + auto gpuDataPtr = gpuDataBuffer.cur(); + auto size = pi->sizeFineToCoarseSend(fineBlock, *dir); + auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) + } + } + } + } + localBuffer_[FINE_TO_COARSE][index].clear(); + } + } + + // wait for packing to finish + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + + if (sendFromGPU_) + bufferSystemGPU_[FINE_TO_COARSE][index].sendAll(); + else + bufferSystemCPU_[FINE_TO_COARSE][index].sendAll(); + + communicationInProgress_[FINE_TO_COARSE][index] = true; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t level) +{ + if (!communicationInProgress_[EQUAL_LEVEL][level] || packInfos_.empty()) return; + + auto forest = blockForest_.lock(); + 
WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(level, forest->getNumberOfLevels()) + + if (sendFromGPU_) + { + // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); + for (auto recvInfo = bufferSystemGPU_[EQUAL_LEVEL][level].begin(); + recvInfo != bufferSystemGPU_[EQUAL_LEVEL][level].end(); ++recvInfo) + { + recvInfo.buffer().clear(); + for (auto& header : headers_[EQUAL_LEVEL][level][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + + for (auto& pi : packInfos_) + { + GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + // parallelSection.run([&](auto s) { + pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer); + // }); + } + } + } + } + else + { + for (auto recvInfo = bufferSystemCPU_[EQUAL_LEVEL][level].begin(); + recvInfo != bufferSystemCPU_[EQUAL_LEVEL][level].end(); ++recvInfo) + { + auto& gpuBuffer = bufferSystemGPU_[EQUAL_LEVEL][level].sendBuffer(recvInfo.rank()); + + recvInfo.buffer().clear(); + gpuBuffer.clear(); + for (auto& header : headers_[EQUAL_LEVEL][level][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + auto size = pi->sizeEqualLevelSend(senderBlock, header.dir); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); + auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) + + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, nullptr)) + pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer); + } + } + } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + } + 
communicationInProgress_[EQUAL_LEVEL][level] = false; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fineLevel) +{ + if (!communicationInProgress_[COARSE_TO_FINE][fineLevel] || packInfos_.empty()) return; + + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + + if (sendFromGPU_) + { + // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); + for (auto recvInfo = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].begin(); + recvInfo != bufferSystemGPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) + { + recvInfo.buffer().clear(); + for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + // auto size = pi->sizeCoarseToFineSend( senderBlock, block->getId(), header.dir ); + GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + // parallelSection.run([&](auto s) { + pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer); + // }); + } + } + } + } + else + { + auto parallelSection = parallelSectionManager_.parallelSection(nullptr); + for (auto recvInfo = bufferSystemCPU_[COARSE_TO_FINE][fineLevel].begin(); + recvInfo != bufferSystemCPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) + { + auto& gpuBuffer = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].sendBuffer(recvInfo.rank()); + + recvInfo.buffer().clear(); + gpuBuffer.clear(); + for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* 
>(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + auto size = pi->sizeCoarseToFineSend(senderBlock, block->getId(), header.dir); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); + auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) + + parallelSection.run([&](auto s) { + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s)) + pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer); + }); + } + } + } + } + communicationInProgress_[COARSE_TO_FINE][fineLevel] = false; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fineLevel) +{ + if (!communicationInProgress_[FINE_TO_COARSE][fineLevel] || packInfos_.empty()) return; + + WALBERLA_ASSERT_GREATER(fineLevel, uint_t(0)) + + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) + // WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); + + if (sendFromGPU_) + { + // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); + for (auto recvInfo = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].begin(); + recvInfo != bufferSystemGPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo) + { + recvInfo.buffer().clear(); + for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + 
WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + // parallelSection.run([&](auto s) { + pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer); + // }); + } + } + } + } + else + { + auto parallelSection = parallelSectionManager_.parallelSection(nullptr); + for (auto recvInfo = bufferSystemCPU_[FINE_TO_COARSE][fineLevel].begin(); + recvInfo != bufferSystemCPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo) + { + auto& gpuBuffer = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].sendBuffer(recvInfo.rank()); + + recvInfo.buffer().clear(); + gpuBuffer.clear(); + for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()]) + { + auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); + auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); + + for (auto& pi : packInfos_) + { + auto size = pi->sizeFineToCoarseSend(senderBlock, header.dir); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); + auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) + + parallelSection.run([&](auto s) { + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s)) + pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer); + }); + } + } + } + } + communicationInProgress_[FINE_TO_COARSE][fineLevel] = false; +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::setupCommunication() +{ + WALBERLA_ASSERT_GREATER(packInfos_.size(), uint_c(0), + "You have not registered a packInfo yet, thus setupCommunication does not work yet.") + auto forest = blockForest_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(forest, + "Trying to access communication for a block storage object that doesn't exist anymore") + const uint_t levels = forest->getNumberOfLevels(); + + std::vector< std::vector< std::map< mpi::MPIRank, mpi::MPISize 
> > > + receiverInfo; // how many bytes to send to each neighbor + std::vector< std::vector< mpi::BufferSystem > > headerExchangeBs; + + receiverInfo.resize(3); + receiverInfo[EQUAL_LEVEL].resize(levels + uint_c(1)); + receiverInfo[COARSE_TO_FINE].resize(levels + uint_c(1)); + receiverInfo[FINE_TO_COARSE].resize(levels + uint_c(1)); + + std::vector< std::vector< mpi::MPISize > > localBufferSize; + + headerExchangeBs.resize(3); + localBufferSize.resize(3); + + for (uint_t j = 0; j <= levels; ++j) + { + headerExchangeBs[EQUAL_LEVEL].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[COARSE_TO_FINE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[FINE_TO_COARSE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + + localBufferSize[EQUAL_LEVEL].push_back(mpi::MPISize(0)); + localBufferSize[COARSE_TO_FINE].push_back(mpi::MPISize(0)); + localBufferSize[FINE_TO_COARSE].push_back(mpi::MPISize(0)); + } + + for (auto& iBlock : *forest) + { + auto block = dynamic_cast< Block* >(&iBlock); + if (!selectable::isSetSelected(block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) continue; + + const BlockID& senderId = block->getId(); + auto nLevel = block->getLevel(); + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + // skip if block has no neighbors in this direction + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + + if (block->neighborhoodSectionHasEquallySizedBlock(neighborIdx)) + { + WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + continue; + + const BlockID& receiverId = 
block->getNeighborId(neighborIdx, uint_t(0)); + auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, uint_t(0))); + + for (auto& pi : packInfos_) + { + receiverInfo[EQUAL_LEVEL][nLevel][nProcess] += mpi::MPISize(pi->sizeEqualLevelSend(block, *dir)); + } + + auto& headerBuffer = headerExchangeBs[EQUAL_LEVEL][nLevel].sendBuffer(nProcess); + receiverId.toBuffer(headerBuffer); + senderId.toBuffer(headerBuffer); + headerBuffer << *dir; + } + else if (block->neighborhoodSectionHasSmallerBlocks(neighborIdx)) + { + auto fineLevel = nLevel + uint_c(1); // For indexing always the fineLevel is taken to be consistent. + WALBERLA_ASSERT_LESS(fineLevel, levels) + + for (uint_t n = 0; n != block->getNeighborhoodSectionSize(neighborIdx); ++n) + { + const BlockID& receiverId = block->getNeighborId(neighborIdx, n); + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, n), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( block->neighborExistsLocally( neighborIdx, n ) ) + { + for (auto& pi : packInfos_) + localBufferSize[COARSE_TO_FINE][fineLevel] += mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir)); + continue; + } + + auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, n)); + for (auto& pi : packInfos_) + receiverInfo[COARSE_TO_FINE][fineLevel][nProcess] += + mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir)); + auto& headerBuffer = headerExchangeBs[COARSE_TO_FINE][fineLevel].sendBuffer(nProcess); + receiverId.toBuffer(headerBuffer); + senderId.toBuffer(headerBuffer); + headerBuffer << *dir; + } + } + else if (block->neighborhoodSectionHasLargerBlock(neighborIdx)) + { + WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + + const BlockID& receiverId = block->getNeighborId(neighborIdx, uint_t(0)); + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + + if( 
block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + for (auto& pi : packInfos_) + localBufferSize[FINE_TO_COARSE][nLevel] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); + continue; + } + + auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, uint_t(0))); + for (auto& pi : packInfos_) + receiverInfo[FINE_TO_COARSE][nLevel][nProcess] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); + + auto& headerBuffer = headerExchangeBs[FINE_TO_COARSE][nLevel].sendBuffer(nProcess); + receiverId.toBuffer(headerBuffer); + senderId.toBuffer(headerBuffer); + headerBuffer << *dir; + } + } + } + + for (uint_t i = 0; i != 3; ++i) + { + for (uint_t j = 0; j <= levels; ++j) + { + headerExchangeBs[i][j].setReceiverInfoFromSendBufferState(false, true); + headerExchangeBs[i][j].sendAll(); + for (auto recvIter = headerExchangeBs[i][j].begin(); recvIter != headerExchangeBs[i][j].end(); ++recvIter) + { + auto& headerVector = headers_[i][j][recvIter.rank()]; + auto& buffer = recvIter.buffer(); + while (buffer.size()) + { + Header header; + header.receiverId.fromBuffer(buffer); + header.senderId.fromBuffer(buffer); + buffer >> header.dir; + headerVector.push_back(header); + } + } + + bufferSystemCPU_[i][j].setReceiverInfo(receiverInfo[i][j]); + bufferSystemGPU_[i][j].setReceiverInfo(receiverInfo[i][j]); + + for (auto it : receiverInfo[i][j]) + { + bufferSystemCPU_[i][j].sendBuffer(it.first).resize(size_t(it.second)); + bufferSystemGPU_[i][j].sendBuffer(it.first).resize(size_t(it.second)); + } + if (localBufferSize[i][j] > 0) + localBuffer_[i][j].resize(size_t(localBufferSize[i][j])); + } + } + + forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); +} + +template< typename Stencil > +bool NonUniformGPUScheme< Stencil >::isAnyCommunicationInProgress() const +{ + for (auto caseIt = communicationInProgress_.begin(); caseIt != communicationInProgress_.end(); ++caseIt) + for (auto levelIt = caseIt->begin(); levelIt != caseIt->end(); 
++levelIt) + if (*levelIt) return true; + + return false; +} + +template< typename Stencil > +NonUniformGPUScheme< Stencil >::~NonUniformGPUScheme() +{ + for (uint_t i = 0; i != bufferSystemGPU_[EQUAL_LEVEL].size(); ++i) + { + waitCommunicateEqualLevel(i); + waitCommunicateCoarseToFine(i); + waitCommunicateFineToCoarse(i); + } +} + +template< typename Stencil > +void NonUniformGPUScheme< Stencil >::addPackInfo(const shared_ptr< GeneratedNonUniformGPUPackInfo >& pi) +{ + if (isAnyCommunicationInProgress()) + { + WALBERLA_ABORT("You may not add a PackInfo to a NonUniformBufferedScheme if any communication is in progress!") + } + packInfos_.push_back(pi); + setupCommunication(); +} + +} // namespace walberla::gpu::communication diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h similarity index 73% rename from src/cuda/communication/UniformGPUScheme.h rename to src/gpu/communication/UniformGPUScheme.h index 173cfcc4c44166f7ff05e8963fbe7135123aba40..5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5 100644 --- a/src/cuda/communication/UniformGPUScheme.h +++ b/src/gpu/communication/UniformGPUScheme.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file UniformGPUScheme.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== @@ -22,20 +22,25 @@ #pragma once #include "blockforest/StructuredBlockForest.h" -#include "core/mpi/MPIWrapper.h" + #include "core/mpi/BufferSystem.h" +#include "core/mpi/MPIWrapper.h" + #include "domain_decomposition/IBlock.h" -#include "stencil/Directions.h" -#include "cuda/CudaRAII.h" -#include "cuda/communication/GeneratedGPUPackInfo.h" -#include "cuda/communication/CustomMemoryBuffer.h" -#include "cuda/ParallelStreams.h" +#include "stencil/Directions.h" #include <thread> +#include "gpu/GPURAII.h" +#include "gpu/GPUWrapper.h" +#include "gpu/ParallelStreams.h" +#include "gpu/communication/CustomMemoryBuffer.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { @@ -46,21 +51,27 @@ namespace communication { public: explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, bool sendDirectlyFromGPU = false, + bool useLocalCommunication = true, const int tag = 5432 ); explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, bool sendDirectlyFromGPU = false, + bool useLocalCommunication = true, const int tag = 5432 ); void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); - void startCommunication( cudaStream_t stream = nullptr); - void wait( cudaStream_t stream = nullptr); + void startCommunication( gpuStream_t stream = nullptr); + void wait( gpuStream_t stream = nullptr); + + void operator()( gpuStream_t stream = nullptr ) { communicate( stream ); } + inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); } - void operator()( cudaStream_t stream = nullptr ) { communicate( stream ); } - inline void communicate( cudaStream_t stream = nullptr ) { 
startCommunication(stream); wait(stream); } + std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr ); + std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr ); + std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr ); private: void setupCommunication(); @@ -71,9 +82,10 @@ namespace communication { bool setupBeforeNextCommunication_; bool communicationInProgress_; bool sendFromGPU_; + bool useLocalCommunication_; - using CpuBuffer_T = cuda::communication::PinnedMemoryBuffer; - using GpuBuffer_T = cuda::communication::GPUMemoryBuffer; + using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer; + using GpuBuffer_T = gpu::communication::GPUMemoryBuffer; mpi::GenericBufferSystem<CpuBuffer_T, CpuBuffer_T> bufferSystemCPU_; mpi::GenericBufferSystem<GpuBuffer_T, GpuBuffer_T> bufferSystemGPU_; @@ -95,7 +107,7 @@ namespace communication { } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla #include "UniformGPUScheme.impl.h" diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h similarity index 75% rename from src/cuda/communication/UniformGPUScheme.impl.h rename to src/gpu/communication/UniformGPUScheme.impl.h index 089f03e78ec30b5ff7d2ca451ba0f82e41bcc0c8..8a8616c1e6cd371a987bd45a86e677b09d289883 100644 --- a/src/cuda/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -14,53 +14,72 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file UniformGPUScheme.impl.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== -#include "cuda/ParallelStreams.h" +#include "gpu/ParallelStreams.h" namespace walberla { -namespace cuda { +namespace gpu +{ namespace communication { template<typename Stencil> UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, bool sendDirectlyFromGPU, + bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), communicationInProgress_( false ), sendFromGPU_( sendDirectlyFromGPU ), + useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), parallelSectionManager_( -1 ), requiredBlockSelectors_( Set<SUID>::emptySet() ), incompatibleBlockSelectors_( Set<SUID>::emptySet() ) - {} + { + WALBERLA_MPI_SECTION() + { +#if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT) + WALBERLA_CHECK(!sendDirectlyFromGPU) +#endif + } + } template<typename Stencil> UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, bool sendDirectlyFromGPU, + bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), communicationInProgress_( false ), sendFromGPU_( sendDirectlyFromGPU ), + useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), parallelSectionManager_( -1 ), requiredBlockSelectors_( requiredBlockSelectors ), incompatibleBlockSelectors_( incompatibleBlockSelectors ) - {} + { + WALBERLA_MPI_SECTION() + { +#if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT) + WALBERLA_CHECK(!sendDirectlyFromGPU) +#endif + } + } template<typename 
Stencil> - void UniformGPUScheme<Stencil>::startCommunication( cudaStream_t stream ) + void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream ) { WALBERLA_ASSERT( !communicationInProgress_ ) auto forest = blockForest_.lock(); @@ -85,43 +104,55 @@ namespace communication { auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto &iBlock : *forest ) { - auto block = dynamic_cast< Block * >( &iBlock ); + auto senderBlock = dynamic_cast< Block * >( &iBlock ); - if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) + if( !selectable::isSetSelected( senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) { const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); - if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) + + if( senderBlock->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) continue; - auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); - if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) + if( !selectable::isSetSelected( senderBlock->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; - for( auto &pi : packInfos_ ) + if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) && useLocalCommunication_ ) { - parallelSection.run([&](auto s) { - auto size = pi->size( *dir, block ); + auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); + for (auto& pi : packInfos_) + { + pi->communicateLocal(*dir, senderBlock, receiverBlock, stream); + } + } + else + { + auto nProcess = mpi::MPIRank( senderBlock->getNeighborProcess( neighborIdx, uint_t( 0 ))); + + for( auto &pi : 
packInfos_ ) + { + parallelSection.run([&](auto s) { + auto size = pi->size( *dir, senderBlock ); auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - pi->pack( *dir, gpuDataPtr, block, s ); + pi->pack( *dir, gpuDataPtr, senderBlock, s ); if( !sendFromGPU_ ) { auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) - WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s )) + WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s )) } - }); + }); + } } } } } - // wait for packing to finish - cudaStreamSynchronize( stream ); + WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) ); if( sendFromGPU_ ) bufferSystemGPU_.sendAll(); @@ -133,7 +164,7 @@ namespace communication { template<typename Stencil> - void UniformGPUScheme<Stencil>::wait( cudaStream_t stream ) + void UniformGPUScheme<Stencil>::wait( gpuStream_t stream ) { WALBERLA_ASSERT( communicationInProgress_ ) @@ -180,10 +211,9 @@ namespace communication { auto gpuDataPtr = gpuBuffer.advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - parallelSection.run([&](auto s) { - WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size, - cudaMemcpyHostToDevice, s )) + WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size, + gpuMemcpyHostToDevice, s )) pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); }); } @@ -191,6 +221,7 @@ namespace communication { } } + WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) communicationInProgress_ = false; } @@ -215,6 +246,7 @@ namespace communication { for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) { // skip if block has no neighbors in this direction const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); + if( 
block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) continue; @@ -228,6 +260,9 @@ namespace communication { if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; + if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) && useLocalCommunication_ ) + continue; + auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); for( auto &pi : packInfos_ ) @@ -273,7 +308,24 @@ namespace communication { setupBeforeNextCommunication_ = true; } + template< typename Stencil > + std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream) + { + return [this, stream]() { communicate( stream ); }; + } + + template< typename Stencil > + std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream) + { + return [this, stream]() { startCommunication( stream ); }; + } + + template< typename Stencil > + std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) + { + return [this, stream]() { wait( stream ); }; + } } // namespace communication -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/cuda/doc/drawing.svg b/src/gpu/doc/drawing.svg similarity index 99% rename from src/cuda/doc/drawing.svg rename to src/gpu/doc/drawing.svg index 4e356d3f301c16035e3c87dbbb7674d6af2459e6..b931580f55ad6368062073e681c708f95fe9bab4 100644 --- a/src/cuda/doc/drawing.svg +++ b/src/gpu/doc/drawing.svg @@ -135,7 +135,7 @@ id="tspan3761-6" x="50" y="222.36218" - style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">cuda::GPUField</tspan></text> + style="font-size:24px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Monospace;-inkscape-font-specification:Monospace">gpu::GPUField</tspan></text> <rect 
style="fill:#c7ffea;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.71999997px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" id="rect3757-0" diff --git a/src/cuda/doc/fieldAccess.png b/src/gpu/doc/fieldAccess.png similarity index 100% rename from src/cuda/doc/fieldAccess.png rename to src/gpu/doc/fieldAccess.png diff --git a/src/gpu/doc/gpu.dox b/src/gpu/doc/gpu.dox new file mode 100644 index 0000000000000000000000000000000000000000..83f8e78dc9b5dd448f22027f94c68ab798458f01 --- /dev/null +++ b/src/gpu/doc/gpu.dox @@ -0,0 +1,108 @@ + +namespace walberla{ +/*! + +\page gpuPage Introduction to GPU Programming with waLBerla + +WaLBerla is a high-performance computing framework that supports GPU computing using either CUDA or HIP. +In this tutorial, we will provide an overview of the GPU concepts in WaLBerla and show you how to create GPU fields +and write GPU kernels using the provided indexing strategies. + +\section gpuBasicWrapper Basics + +waLBerla supports GPUs through a simple wrapper around both CUDA and HIP libraries. +This allows users to write GPU-accelerated code that can run on both NVIDIA and AMD GPUs. In the following we will explain +the concept with a simple example on how to allocate memory on GPUs. To create and manage GPU memory in waLBerla, +the gpuMalloc function should be used always, which is defined depending on the build system used to compile waLBerla. +Specifically, if waLBerla was built with CUDA, `gpuMalloc` is defined as `cudaMalloc`, while if it was built with HIP, +`gpuMalloc` is defined as `hipMalloc`. This allows users to write GPU-accelerated code that can run on both NVIDIA and AMD GPUs. +Here's an example of how to create a GPU array of 100 floats and set its values to zero using waLBerla: + +\code +#include "gpu/GPUWrapper.h" + +int main() +{ + float* d_array; + gpuMalloc((void**)&d_array, 100 * sizeof(float)); + gpuMemset(d_array, 0, 100 * sizeof(float)); + // ... 
+ return 0; +} +\endcode + +In conclusion, waLBerla provides a simple wrapper around both CUDA and HIP libraries to allow users to write +GPU-accelerated code that can run on both NVIDIA and AMD GPUs. This wrapper is used through the entire backend of waLBerla +and thus for all higher level functionality. As a user most of the time the higher level functionality will be used +and the wrapper is more important for developers. As a next step and introduction to some of the higher level functionality follows. + +\section gpuField Creating and Copying GPU Fields + +To create a GPU field in WaLBerla, you can use the gpu::GPUField class, which is similar to the field::GhostLayerField class used for CPU fields. +You can copy data between the host and device using the gpu::fieldCpy function, as shown in the following example: + + +\subsection gpuFieldOverview Creating GPU fields and copy them between host and device + + \code + GhostLayerField<double,4> h_f(16, 20, 30, 1, 42.0, field::fzyx); + gpu::GPUField<double> d_f(16, 20, 30, 4, 1, field::fzyx); + gpu::fieldCpy(d_f, h_f); // copy from host to device + some_kernel_wrapper(d_f); // run some kernel + gpu::fieldCpy(h_f, d_f); // copy field data back to host + + \endcode + +Note that gpu::GPUField has a template parameter for the number of fields (or channels), whereas field::GhostLayerField +has a template parameter for the size of each field. Also, GPU fields can be accessed using gpu::FieldAccessor objects, which we will discuss next. + +\section gpuKernels Writing and Executing GPU Kernels + +\subsection gpuFieldAccess Writing GPU Kernels with Indexing Strategies + + \image html gpu/doc/fieldAccess.png "Accessing fields in CUDA kernels" + + When writing a kernel that operates on a field, the first task is to distribute the data to threads and blocks. + We need a function $(blockIdx, threadIdx) \\rightarrow (x,y,z)$ or $(blockIdx, threadIdx) \\rightarrow (x,y,z,f)$. 
+ The optimal mapping depends on many parameters: for example which layout the field has, the extents of each coordinate, + hardware parameters like warp-size, etc. + Thus this indexing function is abstracted. A few indexing strategies are already implemented which can be + substituted by custom strategies. + An indexing strategy consists of two classes: a somewhat complex Indexing class, which manages the + indexing on the host-side and a lightweight Accessor class, which is passed to the GPU kernel. + + An indexing scheme is very similar to the iterator concept: it defines the bounds of the iteration, which is not necessarily the + complete field but could also be a certain sub-block, for example the ghost layer in a certain direction. + + + Let's start to write a simple kernel that doubles all values stored in a field: + \code + #include "gpu/FieldAccessor.h" + + __global__ void kernel_double( gpu::FieldAccessor<double> f ) + { + f.set( blockIdx, threadIdx ); + f.get() *= 2.0; + } + \endcode + We do not have to care about indexing, the gpu::FieldAccessor takes care of that. So this is a generic kernel that operates + on double fields. Using the gpu::FieldAccessor the current and neighboring values can be accessed and manipulated. + + This kernel can be called like this: + + \code + gpu::FieldIndexing<double> indexing = gpu::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( field, 1, stencil::E, true ); + kernel_double<<< indexing.gridDim(), indexing.blockDim() >>> ( indexing.gpuAccess() ); + \endcode + + In the example above we only iterate over a slice of the field. Of course we can also iterate over the complete field, there are + various static member functions in an Indexing class to create certain iteration patterns. + The Indexing class encapsulates the information of how to launch the kernel (blockDim and gridDim) and holds the Accessor class that + is passed to the kernel.
+ + Two indexing strategies are currently provided: + - gpu::FieldIndexing and gpu::FieldAccessor (general, but slow ) + - gpu::FieldIndexingXYZ and gpu::FieldAccessorXYZ ( optimized for cell based iterating over bigger chunks, for fields where xSize bigger than warpSize ) + +*/ +} diff --git a/src/cuda/ideasForCommunication.txt b/src/gpu/ideasForCommunication.txt similarity index 100% rename from src/cuda/ideasForCommunication.txt rename to src/gpu/ideasForCommunication.txt diff --git a/src/cuda/lbm/CMakeLists.txt b/src/gpu/lbm/CMakeLists.txt similarity index 72% rename from src/cuda/lbm/CMakeLists.txt rename to src/gpu/lbm/CMakeLists.txt index a2db712aa018fd306019a4c43bf177f536f0c2f9..ae7f60ac849311c4b01a7290b7c0b57d31f06ffe 100644 --- a/src/cuda/lbm/CMakeLists.txt +++ b/src/gpu/lbm/CMakeLists.txt @@ -1,4 +1,4 @@ -target_sources( cuda +target_sources( gpu PRIVATE CombinedInPlaceGpuPackInfo.h ) diff --git a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h b/src/gpu/lbm/CombinedInPlaceGpuPackInfo.h similarity index 91% rename from src/cuda/lbm/CombinedInPlaceGpuPackInfo.h rename to src/gpu/lbm/CombinedInPlaceGpuPackInfo.h index c47d815c111efb82e2d199f17763d620744397b4..cabae3221ac6d40be432f6ab3b6a9179736b10a6 100644 --- a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h +++ b/src/gpu/lbm/CombinedInPlaceGpuPackInfo.h @@ -22,15 +22,16 @@ #define IS_EVEN(x) ((x & 1) ^ 1) -#include "cuda/communication/GeneratedGPUPackInfo.h" - #include "lbm/inplace_streaming/TimestepTracker.h" +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + namespace walberla { namespace lbm { template< typename EvenPackInfo, typename OddPackInfo > -class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo +class CombinedInPlaceGpuPackInfo : public gpu::GeneratedGPUPackInfo { public: template< typename... 
Args > @@ -40,7 +41,7 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo virtual ~CombinedInPlaceGpuPackInfo() = default; - void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override { if (IS_EVEN(tracker_->getCounter())) { @@ -52,7 +53,7 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo } } - void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override { + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override { if (IS_EVEN(tracker_->getCounter())) { evenPackInfo_.unpack(dir, buffer, block, stream); diff --git a/src/cuda/sweeps/CMakeLists.txt b/src/gpu/sweeps/CMakeLists.txt similarity index 66% rename from src/cuda/sweeps/CMakeLists.txt rename to src/gpu/sweeps/CMakeLists.txt index 188a4cbae837e9e649f21c242f7a33e4fdbcc7ff..2126d798ceaa54823dfced6ad447039f0121f4b2 100644 --- a/src/cuda/sweeps/CMakeLists.txt +++ b/src/gpu/sweeps/CMakeLists.txt @@ -1,4 +1,4 @@ -target_sources( cuda +target_sources( gpu PRIVATE GPUSweepBase.h ) diff --git a/src/cuda/sweeps/GPUSweepBase.h b/src/gpu/sweeps/GPUSweepBase.h similarity index 92% rename from src/cuda/sweeps/GPUSweepBase.h rename to src/gpu/sweeps/GPUSweepBase.h index fbd5e2f8e6ff688a95c2e13425f58ff49085b8c9..f8e61dd14fa58246a4b28e65fa3e8edf6a9a8774 100644 --- a/src/cuda/sweeps/GPUSweepBase.h +++ b/src/gpu/sweeps/GPUSweepBase.h @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUSweepBase.h -//! \ingroup cuda +//! \ingroup gpu //! 
\author Paulo Carvalho <prcjunior@inf.ufpr.br> // //====================================================================================================================== @@ -22,7 +22,7 @@ #pragma once -#include "cuda/GPUField.h" +#include "gpu/GPUField.h" #include "core/debug/Debug.h" @@ -31,16 +31,15 @@ #include <set> namespace walberla { -namespace cuda { +namespace gpu +{ template < typename GPUField_T > class GPUSweepBase { public: - GPUSweepBase() - { - } + GPUSweepBase() = default; virtual ~GPUSweepBase() { for( auto field = dstFields_.begin(); field != dstFields_.end(); ++field ) @@ -58,7 +57,7 @@ public: } GPUField_T * dst = src->cloneUninitialized(); - WALBERLA_ASSERT_NOT_NULLPTR( dst ); + WALBERLA_ASSERT_NOT_NULLPTR( dst ) dstFields_.insert( dst ); @@ -71,6 +70,6 @@ protected: }; -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/lbm/blockforest/communication/SimpleCommunication.h b/src/lbm/blockforest/communication/SimpleCommunication.h index 764152da65f081b164b102a41f3d639db5ae9065..7e256f92977f04a2c7e3d7d1321b409b3eb883fb 100644 --- a/src/lbm/blockforest/communication/SimpleCommunication.h +++ b/src/lbm/blockforest/communication/SimpleCommunication.h @@ -47,6 +47,7 @@ class SimpleCommunication : public communication::UniformBufferedScheme< Stencil using VectorFieldFlattened_T = GhostLayerField< real_t, 3 >; using PdfField_T = GhostLayerField< real_t, Stencil_T::Size >; using UintScalarField_T = GhostLayerField< uint_t, 1 >; + using IDScalarField_T = walberla::GhostLayerField< walberla::id_t, 1 >; using FlagField16_T = FlagField< uint16_t >; using FlagField32_T = FlagField< uint32_t >; @@ -148,17 +149,24 @@ class SimpleCommunication : public communication::UniformBufferedScheme< Stencil } else { - if (firstBlock.isDataClassOrSubclassOf< UintScalarField_T >(fieldId)) + if (firstBlock.isDataClassOrSubclassOf< IDScalarField_T >(fieldId)) { - this->addPackInfo(make_shared< PackInfo< UintScalarField_T > >(fieldId)); + 
this->addPackInfo(make_shared< PackInfo< IDScalarField_T > >(fieldId)); } else { - if (firstBlock.isDataClassOrSubclassOf< VectorFieldFlattened_T >(fieldId)) + if (firstBlock.isDataClassOrSubclassOf< UintScalarField_T >(fieldId)) { - this->addPackInfo(make_shared< PackInfo< VectorFieldFlattened_T > >(fieldId)); + this->addPackInfo(make_shared< PackInfo< UintScalarField_T > >(fieldId)); + } + else + { + if (firstBlock.isDataClassOrSubclassOf< VectorFieldFlattened_T >(fieldId)) + { + this->addPackInfo(make_shared< PackInfo< VectorFieldFlattened_T > >(fieldId)); + } + else { WALBERLA_ABORT("Problem with UID"); } } - else { WALBERLA_ABORT("Problem with UID"); } } } } diff --git a/src/lbm/boundary/Curved.h b/src/lbm/boundary/Curved.h index 38e3d308ca03dbf6312c0d6e5badc35f90ee934d..e6c9659f6a2791080832f50cf05d7373396ce9b9 100644 --- a/src/lbm/boundary/Curved.h +++ b/src/lbm/boundary/Curved.h @@ -145,7 +145,7 @@ inline Curved< LatticeModel_T, FlagField_T >::Curved( const BoundaryUID & bounda WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ); WALBERLA_ASSERT( flagField_->isRegistered( domainMask_ ) ); - weights_ = make_shared<WeightField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField_->nrOfGhostLayers(), field::zyxf ); + weights_ = make_shared<WeightField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField_->nrOfGhostLayers(), field::fzyx ); } diff --git a/src/lbm/boundary/DiffusionDirichlet.h b/src/lbm/boundary/DiffusionDirichlet.h index 219732fc28df01d43232d48c42621d43b4355120..7ffa062a01770210b398a4ba5fa9ff3fa475cbaf 100644 --- a/src/lbm/boundary/DiffusionDirichlet.h +++ b/src/lbm/boundary/DiffusionDirichlet.h @@ -123,9 +123,9 @@ inline DiffusionDirichlet< LatticeModel_T, flag_t >::DiffusionDirichlet( const B { WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (flagField != nullptr) - sclField_ = make_shared<ScalarField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), 
flagField->nrOfGhostLayers(), field::zyxf ); + sclField_ = make_shared<ScalarField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), field::fzyx ); else - sclField_ = make_shared<ScalarField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField->nrOfGhostLayers(), field::zyxf ); + sclField_ = make_shared<ScalarField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField->nrOfGhostLayers(), field::fzyx ); } diff --git a/src/lbm/boundary/DynamicUBB.h b/src/lbm/boundary/DynamicUBB.h index de6c9db27703fdf466530a8fd7fdc191cd1663d2..65d6692c828f271580419637c736be01fa375aad 100644 --- a/src/lbm/boundary/DynamicUBB.h +++ b/src/lbm/boundary/DynamicUBB.h @@ -87,7 +87,7 @@ public: } if (StoreForce) - force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } DynamicUBB( const BoundaryUID & boundaryUID, const FlagUID & uid, PDFField * const pdfField, const uint_t level, const VelocityFunctor_T & velocity, const AABB & aabb ) : diff --git a/src/lbm/boundary/NoSlip.h b/src/lbm/boundary/NoSlip.h index 3c52f58729e68ee8098b0d3aff2db8231b462275..43b675e3ab568cafa7cdf47aca2fd6b033815cd5 100644 --- a/src/lbm/boundary/NoSlip.h +++ b/src/lbm/boundary/NoSlip.h @@ -67,7 +67,7 @@ public: WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (StoreForce) - force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } void pushFlags( std::vector< FlagUID >& uids ) const { uids.push_back( uid_ ); } diff --git a/src/lbm/boundary/ParserUBB.h b/src/lbm/boundary/ParserUBB.h index 
847faeab7cd54627211676ac42f8ff082c3fab40..309b44c472ed80ad7168145c252b6a312bb5f614 100644 --- a/src/lbm/boundary/ParserUBB.h +++ b/src/lbm/boundary/ParserUBB.h @@ -299,17 +299,17 @@ inline ParserUBB<LatticeModel_T, flag_t, AdaptVelocityToExternalForce, StoreForc if(flagField != NULL) { - parserField_ = make_shared<ParserField> ( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), flagField->nrOfGhostLayers(), field::zyxf ); - velocityField_ = make_shared<VelocityField>( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), flagField->nrOfGhostLayers(), field::zyxf ); + parserField_ = make_shared<ParserField> ( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), flagField->nrOfGhostLayers(), field::fzyx ); + velocityField_ = make_shared<VelocityField>( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), flagField->nrOfGhostLayers(), field::fzyx ); } else { - parserField_ = make_shared<ParserField> ( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), pdfField->nrOfGhostLayers(), field::zyxf ); - velocityField_ = make_shared<VelocityField>( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), pdfField->nrOfGhostLayers(), field::zyxf ); + parserField_ = make_shared<ParserField> ( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), pdfField->nrOfGhostLayers(), field::fzyx ); + velocityField_ = make_shared<VelocityField>( pdfField->xSize(), pdfField->ySize(), pdfField->zSize(), pdfField->nrOfGhostLayers(), field::fzyx ); } if (StoreForce) - force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } diff --git a/src/lbm/boundary/Pressure.h b/src/lbm/boundary/Pressure.h index bbb93f77099983e9cd2aee69411c5c2b0fabe131..51dc64c2aa5ce044d830ea302a050ef43ba386c8 100644 --- a/src/lbm/boundary/Pressure.h +++ 
b/src/lbm/boundary/Pressure.h @@ -125,9 +125,9 @@ inline Pressure< LatticeModel_T, flag_t>::Pressure( const BoundaryUID & boundary WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (flagField != NULL) - latticeDensityField_ = make_shared<LatticeDensityField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), field::zyxf ); + latticeDensityField_ = make_shared<LatticeDensityField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), field::fzyx ); else - latticeDensityField_ = make_shared<LatticeDensityField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + latticeDensityField_ = make_shared<LatticeDensityField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } diff --git a/src/lbm/boundary/SimpleUBB.h b/src/lbm/boundary/SimpleUBB.h index 8c1c93af05c78372adb27f5b553baa5495372663..114f0a0928f27a026f443bd0aa8efd4d951ff060 100644 --- a/src/lbm/boundary/SimpleUBB.h +++ b/src/lbm/boundary/SimpleUBB.h @@ -68,7 +68,7 @@ public: WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (StoreForce) - force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } SimpleUBB( const BoundaryUID& boundaryUID, const FlagUID& uid, PDFField* const pdfField, const real_t x, const real_t y, const real_t z ) : @@ -77,7 +77,7 @@ public: WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (StoreForce) - force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } void pushFlags( 
std::vector< FlagUID >& uids ) const { uids.push_back( uid_ ); } diff --git a/src/lbm/boundary/UBB.h b/src/lbm/boundary/UBB.h index c866776d2cba47c20107a83b531bbb21c35c78e2..9b1e46fa72895b87325ba7a8c4f63d827753a241 100644 --- a/src/lbm/boundary/UBB.h +++ b/src/lbm/boundary/UBB.h @@ -147,12 +147,12 @@ inline UBB< LatticeModel_T, flag_t, AdaptVelocityToExternalForce, StoreForce >:: { WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (flagField != nullptr) - vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), field::zyxf ); + vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), field::fzyx ); else - vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); if (StoreForce) - force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + force_ = make_shared<ForceField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } diff --git a/src/lbm/boundary/VelocityBoundary.h b/src/lbm/boundary/VelocityBoundary.h index c5be0e0a6370c9cd029dce73ac0df51b59c728bf..72901642c3d261e9cfd4496d80f5e16ce743a306 100644 --- a/src/lbm/boundary/VelocityBoundary.h +++ b/src/lbm/boundary/VelocityBoundary.h @@ -143,9 +143,9 @@ inline VelocityBoundary< LatticeModel_T, flag_t >::VelocityBoundary( const Bound { WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); if (flagField != NULL) - vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), field::zyxf ); + vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), flagField->nrOfGhostLayers(), 
field::fzyx ); else - vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::zyxf ); + vel_ = make_shared<VelField>( pdfField_->xSize(), pdfField_->ySize(), pdfField_->zSize(), pdfField_->nrOfGhostLayers(), field::fzyx ); } diff --git a/src/lbm/field/AddToStorage.h b/src/lbm/field/AddToStorage.h index f36da19b132fa3ca2c14f25243389b33cccf4881..fcf563c4088552d0b2f0ce283d236d4afe62307a 100644 --- a/src/lbm/field/AddToStorage.h +++ b/src/lbm/field/AddToStorage.h @@ -158,7 +158,7 @@ private: template< typename LatticeModel_T, typename BlockStorage_T > BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, const LatticeModel_T & latticeModel, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) @@ -173,7 +173,7 @@ template< typename LatticeModel_T, typename BlockStorage_T > BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, const LatticeModel_T & latticeModel, const uint_t ghostLayers, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) @@ -189,7 +189,7 @@ template< typename LatticeModel_T, typename BlockStorage_T > BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, const LatticeModel_T & latticeModel, const Vector3< real_t > & initialVelocity, const real_t initialDensity, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const 
Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) @@ -206,7 +206,7 @@ BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, c const LatticeModel_T & latticeModel, const Vector3< real_t > & initialVelocity, const real_t initialDensity, const uint_t ghostLayers, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) @@ -227,7 +227,7 @@ struct PdfFieldCreator : public domain_decomposition::BlockDataCreator< lbm::Pdf PdfFieldCreator( const shared_ptr< StructuredBlockStorage > & blocks, const std::string & identifier, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors, const LatticeModel_T & latticeModel, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) : domain_decomposition::BlockDataCreator< lbm::PdfField< LatticeModel_T > >( make_shared< internal::PdfFieldHandling< LatticeModel_T > >( blocks, latticeModel, false, Vector3<real_t>(0), real_t(1), uint_t(1), layout, alloc ), @@ -237,7 +237,7 @@ struct PdfFieldCreator : public domain_decomposition::BlockDataCreator< lbm::Pdf PdfFieldCreator( const shared_ptr< StructuredBlockStorage > & blocks, const std::string & identifier, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors, const LatticeModel_T & latticeModel, const uint_t ghostLayers, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) : domain_decomposition::BlockDataCreator< lbm::PdfField< LatticeModel_T > >( 
make_shared< internal::PdfFieldHandling< LatticeModel_T > >( blocks, latticeModel, false, Vector3<real_t>(0), real_t(1), ghostLayers, layout, alloc ), @@ -247,7 +247,7 @@ struct PdfFieldCreator : public domain_decomposition::BlockDataCreator< lbm::Pdf PdfFieldCreator( const shared_ptr< StructuredBlockStorage > & blocks, const std::string & identifier, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors, const LatticeModel_T & latticeModel, const Vector3< real_t > & initialVelocity, const real_t initialDensity, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) : domain_decomposition::BlockDataCreator< lbm::PdfField< LatticeModel_T > >( make_shared< internal::PdfFieldHandling< LatticeModel_T > >( blocks, latticeModel, true, initialVelocity, initialDensity, uint_t(1), layout, alloc ), @@ -257,7 +257,7 @@ struct PdfFieldCreator : public domain_decomposition::BlockDataCreator< lbm::Pdf PdfFieldCreator( const shared_ptr< StructuredBlockStorage > & blocks, const std::string & identifier, const Set<SUID> & requiredSelectors, const Set<SUID> & incompatibleSelectors, const LatticeModel_T & latticeModel, const Vector3< real_t > & initialVelocity, const real_t initialDensity, const uint_t ghostLayers, - const field::Layout & layout = field::zyxf, + const field::Layout & layout = field::fzyx, const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) : domain_decomposition::BlockDataCreator< lbm::PdfField< LatticeModel_T > >( make_shared< internal::PdfFieldHandling< LatticeModel_T > >( blocks, latticeModel, true, initialVelocity, initialDensity, ghostLayers, layout, alloc ), diff --git a/src/lbm/field/PdfField.h b/src/lbm/field/PdfField.h index 178804a8888d83d648e7c06a64dbe8951b5919fe..73e90220ee7e92b3489df2abe1a8bdd70ff94f46 100644 --- a/src/lbm/field/PdfField.h +++ b/src/lbm/field/PdfField.h @@ -101,7 +101,7 @@ public: const 
LatticeModel_T & _latticeModel, const bool initialize = true, const Vector3< real_t > & initialVelocity = Vector3< real_t >( real_t(0.0) ), const real_t initialDensity = real_t(1.0), - const uint_t ghostLayers = uint_t(1), const field::Layout & _layout = field::zyxf, + const uint_t ghostLayers = uint_t(1), const field::Layout & _layout = field::fzyx, const shared_ptr< field::FieldAllocator<real_t> > & alloc = shared_ptr< field::FieldAllocator<real_t> >() ); ~PdfField() override = default; diff --git a/src/lbm/free_surface/InitFunctions.h b/src/lbm/free_surface/InitFunctions.h index 22cd09c62c794260668a70c30b07f52495c3a780..0dd2a0bbb25938f82ebc4a0957238e7852a086f6 100644 --- a/src/lbm/free_surface/InitFunctions.h +++ b/src/lbm/free_surface/InitFunctions.h @@ -85,7 +85,7 @@ void initFlagsFromFillLevels(const std::weak_ptr< StructuredBlockForest >& block // set flags only in non-boundary and non-obstacle cells if (!handling->isBoundary(fillFieldIt.x(), fillFieldIt.y(), fillFieldIt.z())) { - if (*fillFieldIt <= real_c(0)) + if (floatIsEqual(*fillFieldIt, real_c(0), real_c(1e-14))) { // set gas flag handling->forceFlag(flagInfo.gasFlag, fillFieldIt.x(), fillFieldIt.y(), fillFieldIt.z()); diff --git a/src/lbm/free_surface/InterfaceFromFillLevel.h b/src/lbm/free_surface/InterfaceFromFillLevel.h index ef113e327032695d73df64109f0581c3b03cc4bd..f50c16cdd046f6bea042e163bdec5880f5172945 100644 --- a/src/lbm/free_surface/InterfaceFromFillLevel.h +++ b/src/lbm/free_surface/InterfaceFromFillLevel.h @@ -44,7 +44,7 @@ inline bool isInterfaceFromFillLevel(const ScalarField_T& fillField, cell_idx_t real_t fillLevel = fillField.get(x, y, z); // this cell is regular gas cell - if (fillLevel <= real_c(0.0)) { return false; } + if (floatIsEqual(fillLevel, real_c(0.0), real_c(1e-14))) { return false; } // this cell is regular interface cell if (fillLevel < real_c(1.0)) { return true; } diff --git a/src/lbm/free_surface/VtkWriter.h b/src/lbm/free_surface/VtkWriter.h index 
726e0ac735ef4495d234532a7e3eb3e52c0372a8..f8d3b453d79edc572e8d697ebc921b615a2f2486 100644 --- a/src/lbm/free_surface/VtkWriter.h +++ b/src/lbm/free_surface/VtkWriter.h @@ -87,6 +87,7 @@ void addVTKOutput(const std::weak_ptr< StructuredBlockForest >& blockForestPtr, writers.push_back(std::make_shared< VTKWriter< VectorField_T, float > >(normalFieldID, "normal")); writers.push_back( std::make_shared< VTKWriter< VectorField_T, float > >(obstacleNormalFieldID, "obstacle_normal")); + if constexpr (useCodegen) { if (forceDensityFieldID != BlockDataID()) diff --git a/src/lbm/free_surface/dynamics/functionality/AdvectMass.h b/src/lbm/free_surface/dynamics/functionality/AdvectMass.h index 9ec9e655c41c877c55f3db764c363a947ea5943f..971529a1b78400cab191209a111686f0f1abf01f 100644 --- a/src/lbm/free_surface/dynamics/functionality/AdvectMass.h +++ b/src/lbm/free_surface/dynamics/functionality/AdvectMass.h @@ -178,7 +178,19 @@ real_t advectMass(const FlagField_T* flagField, const ConstScalarIt_T& fillSrc, } // PDF pointing from neighbor to current cell - const real_t neighborPdf = pdfFieldIt.neighbor(*dir, dir.toInvIdx()); + real_t neighborPdf = real_c(0); + if (!isFreeSlip) { neighborPdf = pdfFieldIt.neighbor(*dir, dir.toInvIdx()); } + else + { + // get PDF reflected at free slip boundary condition + stencil::Direction neighborPdfDir = *dir; + + if (freeSlipDir[0] != cell_idx_c(0)) { neighborPdfDir = stencil::mirrorX[neighborPdfDir]; } + if (freeSlipDir[1] != cell_idx_c(0)) { neighborPdfDir = stencil::mirrorY[neighborPdfDir]; } + if (freeSlipDir[2] != cell_idx_c(0)) { neighborPdfDir = stencil::mirrorZ[neighborPdfDir]; } + neighborPdf = pdfFieldIt.neighbor(freeSlipDir[0], freeSlipDir[1], freeSlipDir[2], + LatticeModel_T::Stencil::idx[neighborPdfDir]); + } // PDF pointing to neighbor const real_t localPdf = pdfFieldIt.getF(dir.toIdx()); diff --git a/src/lbm/lattice_model/LatticeModelBase.h b/src/lbm/lattice_model/LatticeModelBase.h index 
568800a4d0434a5e87c6b00078fe8234209c077b..e8db886f5be20469e28bd660e49eaec8eda2e9b5 100644 --- a/src/lbm/lattice_model/LatticeModelBase.h +++ b/src/lbm/lattice_model/LatticeModelBase.h @@ -1,15 +1,15 @@ //====================================================================================================================== // -// This file is part of waLBerla. waLBerla is free software: you can +// This file is part of waLBerla. waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of +// License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. -// +// // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // @@ -83,7 +83,8 @@ public: LatticeModelBase( const CollisionModel_T & cm, const ForceModel_T & fm ) : collisionModel_( cm ), forceModel_( fm ) { - if (Compressible && forceModel_.constant) + if (Compressible && forceModel_.constant && + !std::is_same< typename ForceModel::tag, force_model::None_tag >::value) { WALBERLA_LOG_WARNING_ON_ROOT("WARNING: You are using a compressible lattice model with a constant force " "model. 
You should consider using a field-based force model, and adjust the body " @@ -132,8 +133,8 @@ public: protected: virtual void config( IBlock & block, StructuredBlockStorage & sbs ) = 0; - - + + CollisionModel_T collisionModel_; ForceModel_T forceModel_; diff --git a/src/lbm_generated/CMakeLists.txt b/src/lbm_generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2513a58f2e646025fa86107409058a7576a3f62f --- /dev/null +++ b/src/lbm_generated/CMakeLists.txt @@ -0,0 +1,25 @@ +add_library( lbm_generated) + +target_link_libraries( lbm_generated + PUBLIC + blockforest + boundary + communication + core + domain_decomposition + field + geometry + gui + stencil + timeloop + vtk + ) + +add_subdirectory( boundary ) +add_subdirectory( communication ) +add_subdirectory( gpu ) +add_subdirectory( evaluation ) +add_subdirectory( field ) +add_subdirectory( refinement ) +add_subdirectory( storage_specification ) +add_subdirectory( sweep_collection ) \ No newline at end of file diff --git a/src/lbm_generated/boundary/CMakeLists.txt b/src/lbm_generated/boundary/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..201337a88fa1547002e0266b8837f369cf893b59 --- /dev/null +++ b/src/lbm_generated/boundary/CMakeLists.txt @@ -0,0 +1,25 @@ +target_sources( lbm_generated + PRIVATE + D3Q19BoundaryCollection.h + D3Q27BoundaryCollection.h + FreeSlipD3Q19.h + FreeSlipD3Q19.cpp + FreeSlipD3Q27.h + FreeSlipD3Q27.cpp + FixedDensityD3Q19.h + FixedDensityD3Q19.cpp + FixedDensityD3Q27.h + FixedDensityD3Q27.cpp + NoSlipD3Q19.h + NoSlipD3Q19.cpp + NoSlipD3Q27.h + NoSlipD3Q27.cpp + OutflowD3Q19.h + OutflowD3Q19.cpp + OutflowD3Q27.h + OutflowD3Q27.cpp + UBBD3Q19.h + UBBD3Q19.cpp + UBBD3Q27.h + UBBD3Q27.cpp + ) diff --git a/src/lbm_generated/boundary/D3Q19BoundaryCollection.h b/src/lbm_generated/boundary/D3Q19BoundaryCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..eb1a23fb52be36ec0471bf05989512724acdc477 
--- /dev/null +++ b/src/lbm_generated/boundary/D3Q19BoundaryCollection.h @@ -0,0 +1,123 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19BoundaryCollection.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" + +#include "OutflowD3Q19.h" +#include "FixedDensityD3Q19.h" +#include "FreeSlipD3Q19.h" +#include "NoSlipD3Q19.h" +#include "UBBD3Q19.h" + + + +namespace walberla{ +namespace lbm { + +template <typename FlagField_T> +class D3Q19BoundaryCollection +{ + public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + + D3Q19BoundaryCollection(const shared_ptr<StructuredBlockForest> & blocks, BlockDataID flagID_, BlockDataID pdfsID_, FlagUID domainUID_, double density, double u_x, double u_y, double u_z) + : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_) + { + OutflowD3Q19Object = std::make_shared< lbm::OutflowD3Q19 >(blocks, pdfsID); + FixedDensityD3Q19Object = std::make_shared< lbm::FixedDensityD3Q19 >(blocks, pdfsID, density); + FreeSlipD3Q19Object = std::make_shared< 
lbm::FreeSlipD3Q19 >(blocks, pdfsID); + NoSlipD3Q19Object = std::make_shared< lbm::NoSlipD3Q19 >(blocks, pdfsID); + UBBD3Q19Object = std::make_shared< lbm::UBBD3Q19 >(blocks, pdfsID, u_x, u_y, u_z); + + + OutflowD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("Outflow"), domainUID); + FixedDensityD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FixedDensity"), domainUID); + FreeSlipD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FreeSlip"), domainUID); + NoSlipD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("NoSlip"), domainUID); + UBBD3Q19Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("UBB"), domainUID); + + } + + void run (IBlock * block) + { + OutflowD3Q19Object->run(block); + FixedDensityD3Q19Object->run(block); + FreeSlipD3Q19Object->run(block); + NoSlipD3Q19Object->run(block); + UBBD3Q19Object->run(block); + + } + + void inner (IBlock * block) + { + OutflowD3Q19Object->inner(block); + FixedDensityD3Q19Object->inner(block); + FreeSlipD3Q19Object->inner(block); + NoSlipD3Q19Object->inner(block); + UBBD3Q19Object->inner(block); + + } + + void outer (IBlock * block) + { + OutflowD3Q19Object->outer(block); + FixedDensityD3Q19Object->outer(block); + FreeSlipD3Q19Object->outer(block); + NoSlipD3Q19Object->outer(block); + UBBD3Q19Object->outer(block); + + } + + void operator() (IBlock * block) + { + run(block); + } + + std::function<void (IBlock *)> getSweep(Type type = Type::ALL) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { this->inner(block); }; + case Type::OUTER: + return [this](IBlock* block) { this->outer(block); }; + default: + return [this](IBlock* block) { this->run(block); }; + } + } + + weak_ptr< StructuredBlockStorage > blocks_; + BlockDataID flagID; + BlockDataID pdfsID; + walberla::FlagUID domainUID; + + shared_ptr<lbm::OutflowD3Q19> OutflowD3Q19Object; + 
shared_ptr<lbm::FixedDensityD3Q19> FixedDensityD3Q19Object; + shared_ptr<lbm::FreeSlipD3Q19> FreeSlipD3Q19Object; + shared_ptr<lbm::NoSlipD3Q19> NoSlipD3Q19Object; + shared_ptr<lbm::UBBD3Q19> UBBD3Q19Object; + +}; + +} +} diff --git a/src/lbm_generated/boundary/D3Q27BoundaryCollection.h b/src/lbm_generated/boundary/D3Q27BoundaryCollection.h new file mode 100644 index 0000000000000000000000000000000000000000..3428689bda22764cf3552e641d4c1f2656bab37a --- /dev/null +++ b/src/lbm_generated/boundary/D3Q27BoundaryCollection.h @@ -0,0 +1,123 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27BoundaryCollection.h +//! 
\\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "domain_decomposition/IBlock.h" + +#include "OutflowD3Q27.h" +#include "FixedDensityD3Q27.h" +#include "FreeSlipD3Q27.h" +#include "NoSlipD3Q27.h" +#include "UBBD3Q27.h" + + + +namespace walberla{ +namespace lbm { + +template <typename FlagField_T> +class D3Q27BoundaryCollection +{ + public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + + D3Q27BoundaryCollection(const shared_ptr<StructuredBlockForest> & blocks, BlockDataID flagID_, BlockDataID pdfsID_, FlagUID domainUID_, double density, double u_x, double u_y, double u_z) + : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_) + { + OutflowD3Q27Object = std::make_shared< lbm::OutflowD3Q27 >(blocks, pdfsID); + FixedDensityD3Q27Object = std::make_shared< lbm::FixedDensityD3Q27 >(blocks, pdfsID, density); + FreeSlipD3Q27Object = std::make_shared< lbm::FreeSlipD3Q27 >(blocks, pdfsID); + NoSlipD3Q27Object = std::make_shared< lbm::NoSlipD3Q27 >(blocks, pdfsID); + UBBD3Q27Object = std::make_shared< lbm::UBBD3Q27 >(blocks, pdfsID, u_x, u_y, u_z); + + + OutflowD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("Outflow"), domainUID); + FixedDensityD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FixedDensity"), domainUID); + FreeSlipD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("FreeSlip"), domainUID); + NoSlipD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("NoSlip"), domainUID); + UBBD3Q27Object->fillFromFlagField<FlagField_T>(blocks, flagID, walberla::FlagUID("UBB"), domainUID); + + } + + void run (IBlock * block) + { + OutflowD3Q27Object->run(block); + FixedDensityD3Q27Object->run(block); + FreeSlipD3Q27Object->run(block); + NoSlipD3Q27Object->run(block); + UBBD3Q27Object->run(block); + 
+ } + + void inner (IBlock * block) + { + OutflowD3Q27Object->inner(block); + FixedDensityD3Q27Object->inner(block); + FreeSlipD3Q27Object->inner(block); + NoSlipD3Q27Object->inner(block); + UBBD3Q27Object->inner(block); + + } + + void outer (IBlock * block) + { + OutflowD3Q27Object->outer(block); + FixedDensityD3Q27Object->outer(block); + FreeSlipD3Q27Object->outer(block); + NoSlipD3Q27Object->outer(block); + UBBD3Q27Object->outer(block); + + } + + void operator() (IBlock * block) + { + run(block); + } + + std::function<void (IBlock *)> getSweep(Type type = Type::ALL) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { this->inner(block); }; + case Type::OUTER: + return [this](IBlock* block) { this->outer(block); }; + default: + return [this](IBlock* block) { this->run(block); }; + } + } + + weak_ptr< StructuredBlockStorage > blocks_; + BlockDataID flagID; + BlockDataID pdfsID; + walberla::FlagUID domainUID; + + shared_ptr<lbm::OutflowD3Q27> OutflowD3Q27Object; + shared_ptr<lbm::FixedDensityD3Q27> FixedDensityD3Q27Object; + shared_ptr<lbm::FreeSlipD3Q27> FreeSlipD3Q27Object; + shared_ptr<lbm::NoSlipD3Q27> NoSlipD3Q27Object; + shared_ptr<lbm::UBBD3Q27> UBBD3Q27Object; + +}; + +} +} diff --git a/src/lbm_generated/boundary/FixedDensityD3Q19.cpp b/src/lbm_generated/boundary/FixedDensityD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e449704f5a0bfa4932344fef2a8cab378770592f --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q19.cpp @@ -0,0 +1,141 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q19.cpp +//! \\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "FixedDensityD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_fixeddensityd3q19_even { +static FUNC_PREFIX void fixeddensityd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double density, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + const double rho = density; + const double delta_rho = rho - 1.0; + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * 
)(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + const double vel0Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 10*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 8*_stride_pdfs_3]; + const double vel1Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3]; + const double vel2Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 5*_stride_pdfs_3]; + const double u_0 = vel0Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 13*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 17*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 3*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 7*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 9*_stride_pdfs_3]; + const double u_1 = vel1Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 
10*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 12*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 16*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 2*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 8*_stride_pdfs_3]; + const double u_2 = vel2Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 15*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 16*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 17*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 18*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 14*_stride_pdfs_3]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.0*(u_0*u_0) - 1.0*(u_1*u_1) - 1.0*(u_2*u_2); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = -1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir] + 2.0*((((dir) == (0))) ? (f_eq_common*0.33333333333333331): ((((dir) == (1)) || ((dir) == (2))) ? 
(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + 0.33333333333333331*(u_1*u_1)): ((((dir) == (3)) || ((dir) == (4))) ? (delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + 0.33333333333333331*(u_0*u_0)): ((((dir) == (5)) || ((dir) == (6))) ? (delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + 0.33333333333333331*(u_2*u_2)): ((((dir) == (7))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)): ((((dir) == (8)) || ((dir) == (9))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)): ((((dir) == (10))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)): ((((dir) == (11))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)): ((((dir) == (12))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)): ((((dir) == (13))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)): ((((dir) == (14))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)): ((((dir) == (15))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)): ((((dir) == (16))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)): ((((dir) == (17))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)): ((((dir) == (18))) ? 
(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)): (0.0)))))))))))))))); + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void FixedDensityD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & density = density_; + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_fixeddensityd3q19_even::fixeddensityd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize); + } else { + internal_fixeddensityd3q19_even::fixeddensityd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize); + } +} + +void FixedDensityD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void FixedDensityD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void FixedDensityD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/FixedDensityD3Q19.h 
b/src/lbm_generated/boundary/FixedDensityD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..b4575d189724633c503fc0ba94a004c5b07ef9c2 --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q19.h @@ -0,0 +1,509 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FixedDensityD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FixedDensityD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double density) + : pdfsID(pdfsID_), density_(density) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FixedDensityD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * 
block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double density_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/FixedDensityD3Q27.cpp b/src/lbm_generated/boundary/FixedDensityD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ff43bc5efa34a0ba88e8205440f46e5fa6db94b --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q27.cpp @@ -0,0 +1,140 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q27.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "FixedDensityD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_fixeddensityd3q27_even { +static FUNC_PREFIX void fixeddensityd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double density, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + const double rho = density; + const double delta_rho = rho - 1.0; + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + const double vel0Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 10*_stride_pdfs_3] + 
_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 18*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 21*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 25*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 4*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 8*_stride_pdfs_3]; + const double vel1Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 15*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 24*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 7*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3]; + const double vel2Term = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 12*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 13*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 22*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 5*_stride_pdfs_3]; + const double u_0 = vel0Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 13*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 17*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 20*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + 
_stride_pdfs_1*y + _stride_pdfs_2*z + 22*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 24*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 26*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 3*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 7*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 9*_stride_pdfs_3]; + const double u_1 = vel1Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 10*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 12*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 16*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 2*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 21*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 22*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 25*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 26*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 9*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 23*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 8*_stride_pdfs_3]; + const double u_2 = vel2Term - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 15*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 16*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 17*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + 
_stride_pdfs_1*y + _stride_pdfs_2*z + 18*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 23*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 24*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 25*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 26*_stride_pdfs_3] - 1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 6*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 11*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 14*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 19*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 20*_stride_pdfs_3] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 21*_stride_pdfs_3]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.5*(u_0*u_0) - 1.5*(u_1*u_1) - 1.5*(u_2*u_2); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = -1.0*_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir] + 2.0*((((dir) == (0))) ? (f_eq_common*0.29629629629629628): ((((dir) == (1)) || ((dir) == (2))) ? (f_eq_common*0.07407407407407407 + 0.33333333333333331*(u_1*u_1)): ((((dir) == (3)) || ((dir) == (4))) ? (f_eq_common*0.07407407407407407 + 0.33333333333333331*(u_0*u_0)): ((((dir) == (5)) || ((dir) == (6))) ? (f_eq_common*0.07407407407407407 + 0.33333333333333331*(u_2*u_2)): ((((dir) == (7))) ? 
(f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Mu1*u0Mu1)): ((((dir) == (8)) || ((dir) == (9))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Pu1*u0Pu1)): ((((dir) == (10))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Mu1*u0Mu1)): ((((dir) == (11))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u1Pu2*u1Pu2)): ((((dir) == (12))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u1Mu2*u1Mu2)): ((((dir) == (13))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Mu2*u0Mu2)): ((((dir) == (14))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Pu2*u0Pu2)): ((((dir) == (15))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u1Mu2*u1Mu2)): ((((dir) == (16))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u1Pu2*u1Pu2)): ((((dir) == (17))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Pu2*u0Pu2)): ((((dir) == (18))) ? (f_eq_common*0.018518518518518517 + 0.083333333333333329*(u0Mu2*u0Mu2)): ((((dir) == (19))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)): ((((dir) == (20))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)): ((((dir) == (21))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)): ((((dir) == (22)) || ((dir) == (23))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)): ((((dir) == (24))) ? 
(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)): ((((dir) == (25))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)): ((((dir) == (26))) ? (delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)): (0.0))))))))))))))))))))))); + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void FixedDensityD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & density = density_; + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_fixeddensityd3q27_even::fixeddensityd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, density, indexVectorSize); + } else { + internal_fixeddensityd3q27_even::fixeddensityd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, 
_stride_pdfs_3, density, indexVectorSize); + } +} + +void FixedDensityD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void FixedDensityD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void FixedDensityD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/FixedDensityD3Q27.h b/src/lbm_generated/boundary/FixedDensityD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..359540d25af2be0c78b85ad591c27aaba8d48de8 --- /dev/null +++ b/src/lbm_generated/boundary/FixedDensityD3Q27.h @@ -0,0 +1,645 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FixedDensityD3Q27.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FixedDensityD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FixedDensityD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double density) + : pdfsID(pdfsID_), density_(density) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FixedDensityD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * 
block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double density_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/FreeSlipD3Q19.cpp b/src/lbm_generated/boundary/FreeSlipD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e3dc46580b5cbd0bdf533dd33742986ab13cd7f --- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q19.cpp @@ -0,0 +1,132 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "FreeSlipD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_freeslipd3q19_even { +static FUNC_PREFIX void freeslipd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + 
_data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 16])) + neighbour_offset_x[dir]) + _stride_pdfs_1*y + _stride_pdfs_1*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 20])) + neighbour_offset_y[dir]) + _stride_pdfs_2*z + _stride_pdfs_2*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 24])) + neighbour_offset_z[dir]) + _stride_pdfs_3**((int32_t * )(& _data_indexVector[32*ctr_0 + 28]))]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void FreeSlipD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_freeslipd3q19_even::freeslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_freeslipd3q19_even::freeslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, 
_stride_pdfs_3, indexVectorSize); + } +} + +void FreeSlipD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void FreeSlipD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void FreeSlipD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/FreeSlipD3Q19.h b/src/lbm_generated/boundary/FreeSlipD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..4679ffc4ff0cbf7cc5bfb07d1a9f9d9a7e775e2e --- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q19.h @@ -0,0 +1,1101 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FreeSlipD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + int32_t wnx; + int32_t wny; + int32_t wnz; + int32_t ref_dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), wnx(), wny(), wnz(), ref_dir() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && wnx == o.wnx && wny == o.wny && wnz == o.wnz && ref_dir == o.ref_dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FreeSlipD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FreeSlipD3Q19"); + }; 
+ + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 0, 0); + int32_t ref_dir = 0; // dir: 0 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 0; + ref_dir = 0; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 1, 0); + int32_t ref_dir = 2; // dir: 1 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 0; + ref_dir = 1; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, -1, 0); + int32_t ref_dir = 1; // dir: 2 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 0; + ref_dir = 2; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 0, 0); + int32_t ref_dir = 4; // dir: 3 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 0; + ref_dir = 3; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 0, 0); + int32_t ref_dir = 3; // dir: 4 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 0; + ref_dir = 4; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 0, 1); + int32_t ref_dir = 6; // dir: 5 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = -1; + ref_dir = 5; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 0, -1); + int32_t ref_dir = 5; // dir: 6 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 1; + ref_dir = 6; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 1, 0); + int32_t ref_dir = 10; // dir: 7 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = 0; + ref_dir = 7; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 1, 0); + int32_t ref_dir = 9; // dir: 8 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = 0; + ref_dir = 8; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, -1, 0); + int32_t ref_dir = 8; // dir: 9 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = 0; + ref_dir = 9; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, -1, 0); + int32_t ref_dir = 7; // dir: 10 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = 0; + ref_dir = 10; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 1, 1); + int32_t ref_dir = 16; // dir: 11 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = -1; + ref_dir = 11; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, -1, 1); + int32_t ref_dir = 15; // dir: 12 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = -1; + ref_dir = 12; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 0, 1); + int32_t ref_dir = 18; // dir: 13 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = -1; + ref_dir = 13; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 0, 1); + int32_t ref_dir = 17; // dir: 14 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = -1; + ref_dir = 14; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, 1, -1); + int32_t ref_dir = 12; // dir: 15 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 1; + ref_dir = 15; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(0, -1, -1); + int32_t ref_dir = 11; // dir: 16 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 1; + ref_dir = 16; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(-1, 0, -1); + int32_t ref_dir = 14; // dir: 17 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 1; + ref_dir = 17; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14 }; + const Cell n = it.cell() + Cell(1, 0, -1); + int32_t ref_dir = 13; // dir: 18 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 1; + ref_dir = 18; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/FreeSlipD3Q27.cpp b/src/lbm_generated/boundary/FreeSlipD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3364610eec662874c88832e7ebedd144755ccf1a 
--- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q27.cpp @@ -0,0 +1,132 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q27.cpp +//! \\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "FreeSlipD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_freeslipd3q27_even { +static FUNC_PREFIX void freeslipd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 
0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 16])) + neighbour_offset_x[dir]) + _stride_pdfs_1*y + _stride_pdfs_1*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 20])) + neighbour_offset_y[dir]) + _stride_pdfs_2*z + _stride_pdfs_2*(*((int32_t * )(& _data_indexVector[32*ctr_0 + 24])) + neighbour_offset_z[dir]) + _stride_pdfs_3**((int32_t * )(& _data_indexVector[32*ctr_0 + 28]))]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void FreeSlipD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = 
block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_freeslipd3q27_even::freeslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_freeslipd3q27_even::freeslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void FreeSlipD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void FreeSlipD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void FreeSlipD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/FreeSlipD3Q27.h b/src/lbm_generated/boundary/FreeSlipD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..562dfbcadd6e98f88ece133ab724080f3488b77e --- /dev/null +++ b/src/lbm_generated/boundary/FreeSlipD3Q27.h @@ -0,0 +1,1485 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file FreeSlipD3Q27.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class FreeSlipD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + int32_t wnx; + int32_t wny; + int32_t wnz; + int32_t ref_dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), wnx(), wny(), wnz(), ref_dir() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && wnx == o.wnx && wny == o.wny && wnz == o.wnz && ref_dir == o.ref_dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + 
IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + FreeSlipD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_FreeSlipD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = 
indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 0, 0); + int32_t ref_dir = 0; // dir: 0 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 0; 
+ ref_dir = 0; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 1, 0); + int32_t ref_dir = 2; // dir: 1 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 0; + ref_dir = 1; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = 
flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, -1, 0); + int32_t ref_dir = 1; // dir: 2 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 0; + ref_dir = 2; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 0, 0); + int32_t ref_dir = 4; // dir: 3 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 0; + ref_dir = 3; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 0, 0); + int32_t ref_dir = 3; // dir: 4 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 0; + ref_dir = 4; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 0, 1); + int32_t ref_dir = 6; // dir: 5 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = -1; + ref_dir = 5; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 0, -1); + int32_t ref_dir = 5; // dir: 6 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 0; + element.wnz = 1; + ref_dir = 6; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 1, 0); + int32_t ref_dir = 10; // dir: 7 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = 0; + ref_dir = 7; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 1, 0); + int32_t ref_dir = 9; // dir: 8 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = 0; + ref_dir = 8; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, -1, 0); + int32_t ref_dir = 8; // dir: 9 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = 0; + ref_dir = 9; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, -1, 0); + int32_t ref_dir = 7; // dir: 10 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 0, domainFlag ) ) + { + element.wnz = 0; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = 0; + ref_dir = 10; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 1, 1); + int32_t ref_dir = 16; // dir: 11 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = -1; + ref_dir = 11; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, -1, 1); + int32_t ref_dir = 15; // dir: 12 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = -1; + ref_dir = 12; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 0, 1); + int32_t ref_dir = 18; // dir: 13 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = -1; + ref_dir = 13; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 0, 1); + int32_t ref_dir = 17; // dir: 14 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = -1; + ref_dir = 14; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, 1, -1); + int32_t ref_dir = 12; // dir: 15 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = -1; + element.wnz = 1; + ref_dir = 15; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(0, -1, -1); + int32_t ref_dir = 11; // dir: 16 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 0, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 0; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 0; + element.wny = 1; + element.wnz = 1; + ref_dir = 16; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 0, -1); + int32_t ref_dir = 14; // dir: 17 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 0; + element.wnz = 1; + ref_dir = 17; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 0, -1); + int32_t ref_dir = 13; // dir: 18 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 0, n.z(), domainFlag ) ) + { + element.wny = 0; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 0; + element.wnz = 1; + ref_dir = 18; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 1, 1); + int32_t ref_dir = 26; // dir: 19 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = -1; + ref_dir = 19; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 1, 1); + int32_t ref_dir = 25; // dir: 20 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = -1; + ref_dir = 20; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, -1, 1); + int32_t ref_dir = 24; // dir: 21 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = -1; + ref_dir = 21; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, -1, 1); + int32_t ref_dir = 23; // dir: 22 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + -1, domainFlag ) ) + { + element.wnz = -1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = -1; + ref_dir = 22; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, 1, -1); + int32_t ref_dir = 22; // dir: 23 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = -1; + element.wnz = 1; + ref_dir = 23; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, 1, -1); + int32_t ref_dir = 21; // dir: 24 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + -1, n.z(), domainFlag ) ) + { + element.wny = -1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = -1; + element.wnz = 1; + ref_dir = 24; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(1, -1, -1); + int32_t ref_dir = 20; // dir: 25 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + -1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = -1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = -1; + element.wny = 1; + element.wnz = 1; + ref_dir = 25; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + const int32_t x_axis_mirrored_stencil_dir [] = { 0,1,2,4,3,5,6,8,7,10,9,11,12,14,13,15,16,18,17,20,19,22,21,24,23,26,25 }; + const int32_t y_axis_mirrored_stencil_dir [] = { 0,2,1,3,4,5,6,9,10,7,8,12,11,13,14,16,15,17,18,21,22,19,20,25,26,23,24 }; + const int32_t z_axis_mirrored_stencil_dir [] = { 0,1,2,3,4,6,5,7,8,9,10,15,16,17,18,11,12,13,14,23,24,25,26,19,20,21,22 }; + const Cell n = it.cell() + Cell(-1, -1, -1); + int32_t ref_dir = 19; // dir: 26 + element.wnx = 0; // compute discrete normal vector of free slip wall + element.wny = 0; + if( flagField->isPartOfMaskSet( n.x() + 1, n.y(), n.z(), domainFlag ) ) + { + element.wnx = 1; + ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ]; + } + if( flagField->isPartOfMaskSet( n.x(), n.y() + 1, n.z(), domainFlag ) ) + { + element.wny = 1; + ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ]; + } + element.wnz = 0; + if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + 1, domainFlag ) ) + { + element.wnz = 1; + ref_dir = z_axis_mirrored_stencil_dir[ ref_dir ]; + } + // concave corner (neighbors are non-fluid) + if( element.wnx == 0 && element.wny == 0 && element.wnz == 0 ) + { + element.wnx = 1; + element.wny = 1; + element.wnz = 1; + ref_dir = 26; + } + element.ref_dir = ref_dir; + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/NoSlipD3Q19.cpp b/src/lbm_generated/boundary/NoSlipD3Q19.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..268cbf43361645c8e7886f6abd86a56089a75fff --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q19.cpp @@ -0,0 +1,125 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "NoSlipD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_noslipd3q19_even { +static FUNC_PREFIX void noslipd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + 
_stride_pdfs_3*dir]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void NoSlipD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_noslipd3q19_even::noslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_noslipd3q19_even::noslipd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void NoSlipD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void NoSlipD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void NoSlipD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/NoSlipD3Q19.h b/src/lbm_generated/boundary/NoSlipD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..933108eec5fdcdeee8e0af6abb90617fc149307e --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q19.h @@ -0,0 +1,508 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q19.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class NoSlipD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = 
default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + NoSlipD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_NoSlipD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = 
indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/NoSlipD3Q27.cpp b/src/lbm_generated/boundary/NoSlipD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c38bee8122daa4ee1d09b1b861e5729d232bf310 --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q27.cpp @@ -0,0 +1,126 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q27.cpp +//! \\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "NoSlipD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_noslipd3q27_even { +static FUNC_PREFIX void noslipd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& 
_data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void NoSlipD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_noslipd3q27_even::noslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_noslipd3q27_even::noslipd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void NoSlipD3Q27::run(IBlock * block) +{ + run_impl(block, 
IndexVectors::ALL); +} + +void NoSlipD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void NoSlipD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/NoSlipD3Q27.h b/src/lbm_generated/boundary/NoSlipD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..56bbfb0611d6a506b3ed4558c388b3d9ed65d443 --- /dev/null +++ b/src/lbm_generated/boundary/NoSlipD3Q27.h @@ -0,0 +1,644 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file NoSlipD3Q27.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class NoSlipD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + NoSlipD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_NoSlipD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + 
std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/OutflowD3Q19.cpp b/src/lbm_generated/boundary/OutflowD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d42cf90429601d5ed4809c30b8926548d8bf6618 --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q19.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "OutflowD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_outflowd3q19_even { +static FUNC_PREFIX void outflowd3q19_even(const uint8_t * RESTRICT _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_out_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t 
dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + const double pdf_inter = 0.42264973081037427**((double * )(& _data_indexVector[32*ctr_0 + 24])) + 0.57735026918962573**((double * )(& _data_indexVector[32*ctr_0 + 16])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = pdf_inter; + *((double * )(& _data_indexVector[32*ctr_0 + 16])) = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(neighbour_offset_x[dir] - 1) + _stride_pdfs_1*y + _stride_pdfs_1*neighbour_offset_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*neighbour_offset_z[dir] + _stride_pdfs_3*f_out_inv_dir_idx[dir]]; + *((double * )(& _data_indexVector[32*ctr_0 + 24])) = pdf_inter; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void OutflowD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_outflowd3q19_even::outflowd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, 
_stride_pdfs_3, indexVectorSize); + } else { + internal_outflowd3q19_even::outflowd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void OutflowD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void OutflowD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void OutflowD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/OutflowD3Q19.h b/src/lbm_generated/boundary/OutflowD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..bb2999966556997e70c9f469e65062951276a601 --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q19.h @@ -0,0 +1,277 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class OutflowD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + double pdf; + double pdf_nd; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), pdf(), pdf_nd() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && floatIsEqual(pdf, o.pdf) && floatIsEqual(pdf_nd, o.pdf_nd); + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + OutflowD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_OutflowD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * 
block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + auto pdfs = block->getData< field::GhostLayerField<real_t, 19> >(pdfsID); + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + BlockDataID pdfsCPUID; +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/OutflowD3Q27.cpp b/src/lbm_generated/boundary/OutflowD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ec9a490b443740ff1ae24adfa1a1739261311a2 --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q27.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q27.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "OutflowD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_outflowd3q27_even { +static FUNC_PREFIX void outflowd3q27_even(const uint8_t * RESTRICT _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize) +{ + + const int32_t f_out_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& 
_data_indexVector[32*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[32*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[32*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[32*ctr_0 + 12])); + const double pdf_inter = 0.42264973081037427**((double * )(& _data_indexVector[32*ctr_0 + 24])) + 0.57735026918962573**((double * )(& _data_indexVector[32*ctr_0 + 16])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = pdf_inter; + *((double * )(& _data_indexVector[32*ctr_0 + 16])) = _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*(neighbour_offset_x[dir] - 1) + _stride_pdfs_1*y + _stride_pdfs_1*neighbour_offset_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*neighbour_offset_z[dir] + _stride_pdfs_3*f_out_inv_dir_idx[dir]]; + *((double * )(& _data_indexVector[32*ctr_0 + 24])) = pdf_inter; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void OutflowD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * 
int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_outflowd3q27_even::outflowd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } else { + internal_outflowd3q27_even::outflowd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize); + } +} + +void OutflowD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void OutflowD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void OutflowD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/OutflowD3Q27.h b/src/lbm_generated/boundary/OutflowD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..53b4e4bae5e6c6da6b4b108751120bf90a5ab25b --- /dev/null +++ b/src/lbm_generated/boundary/OutflowD3Q27.h @@ -0,0 +1,349 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file OutflowD3Q27.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class OutflowD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + double pdf; + double pdf_nd; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_), pdf(), pdf_nd() {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir && floatIsEqual(pdf, o.pdf) && floatIsEqual(pdf_nd, o.pdf_nd); + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + OutflowD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_) + : pdfsID(pdfsID_) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_OutflowD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * 
block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + auto pdfs = block->getData< field::GhostLayerField<real_t, 27> >(pdfsID); + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(0), 3); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(0), 9); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(0), 7); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(1), 17); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(0), it.z() + cell_idx_c(-1), 13); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(1), 26); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(1), 26); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(1), 24); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(1), 24); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(-1), 22); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(1), it.z() + cell_idx_c(-1), 22); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + element.pdf = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1), 20); + element.pdf_nd = pdfs->get(it.x() + cell_idx_c(0), it.y() + cell_idx_c(-1), it.z() + cell_idx_c(-1), 20); + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + BlockDataID pdfsCPUID; +public: + BlockDataID pdfsID; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/UBBD3Q19.cpp b/src/lbm_generated/boundary/UBBD3Q19.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a88d2feeff0237881df80f6494a4f58f8936e02 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q19.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q19.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UBBD3Q19.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_ubbd3q19_even { +static FUNC_PREFIX void ubbd3q19_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize, double u_x, double u_y, double u_z) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + + const double weights [] = {0.33333333333333333, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.055555555555555556, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778, 0.027777777777777778}; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0 
}; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = (u_x*6.0*((double)(neighbour_offset_x[dir])) + u_y*6.0*((double)(neighbour_offset_y[dir])) + u_z*6.0*((double)(neighbour_offset_z[dir])))*-1.0*weights[dir] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir]; + } +} +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void UBBD3Q19::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & u_y = u_y_; + auto & u_x = u_x_; + auto & u_z = u_z_; + WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 
1) ^ 1)) { + internal_ubbd3q19_even::ubbd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } else { + internal_ubbd3q19_even::ubbd3q19_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } +} + +void UBBD3Q19::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void UBBD3Q19::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void UBBD3Q19::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/UBBD3Q19.h b/src/lbm_generated/boundary/UBBD3Q19.h new file mode 100644 index 0000000000000000000000000000000000000000..f57bac12d404b9b3d8819d7955dc65c3cdbcab61 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q19.h @@ -0,0 +1,511 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q19.h +//! 
\\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UBBD3Q19 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + UBBD3Q19( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double u_x, double u_y, double u_z) + : pdfsID(pdfsID_), u_x_(u_x), u_y_(u_y), u_z_(u_z) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UBBD3Q19"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock 
* block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double u_x_; + double u_y_; + double u_z_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/UBBD3Q27.cpp b/src/lbm_generated/boundary/UBBD3Q27.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08ee3ef38ef4460b590216b789caea5457da8b97 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q27.cpp @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q27.cpp +//! 
\\author pystencils +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/Macros.h" +#include "UBBD3Q27.h" + + + +#define FUNC_PREFIX + +using namespace std; + +namespace walberla { +namespace lbm { + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifdef __CUDACC__ +#pragma push +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress 177 +#else +#pragma diag_suppress 177 +#endif +#endif + +namespace internal_ubbd3q27_even { +static FUNC_PREFIX void ubbd3q27_even(const uint8_t * RESTRICT const _data_indexVector, double * RESTRICT _data_pdfs, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int32_t indexVectorSize, double u_x, double u_y, double u_z) +{ + + const int32_t f_in_inv_dir_idx [] = { 0,2,1,4,3,6,5,10,9,8,7,16,15,18,17,12,11,14,13,26,25,24,23,22,21,20,19 }; + const int32_t f_in_inv_offsets_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t f_in_inv_offsets_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t f_in_inv_offsets_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + + const double weights [] = {0.29629629629629630, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.074074074074074074, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.018518518518518519, 0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296, 
0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296, 0.0046296296296296296}; + + + + const int32_t neighbour_offset_x [] = { 0,0,0,-1,1,0,0,-1,1,-1,1,0,0,-1,1,0,0,-1,1,1,-1,1,-1,1,-1,1,-1 }; + const int32_t neighbour_offset_y [] = { 0,1,-1,0,0,0,0,1,1,-1,-1,1,-1,0,0,1,-1,0,0,1,1,-1,-1,1,1,-1,-1 }; + const int32_t neighbour_offset_z [] = { 0,0,0,0,0,1,-1,0,0,0,0,1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1 }; + + for (int64_t ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1) + { + const int32_t x = *((int32_t * )(& _data_indexVector[16*ctr_0])); + const int32_t y = *((int32_t * )(& _data_indexVector[16*ctr_0 + 4])); + const int32_t z = *((int32_t * )(& _data_indexVector[16*ctr_0 + 8])); + const int32_t dir = *((int32_t * )(& _data_indexVector[16*ctr_0 + 12])); + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_0*f_in_inv_offsets_x[dir] + _stride_pdfs_1*y + _stride_pdfs_1*f_in_inv_offsets_y[dir] + _stride_pdfs_2*z + _stride_pdfs_2*f_in_inv_offsets_z[dir] + _stride_pdfs_3*f_in_inv_dir_idx[dir]] = (u_x*6.0*((double)(neighbour_offset_x[dir])) + u_y*6.0*((double)(neighbour_offset_y[dir])) + u_z*6.0*((double)(neighbour_offset_z[dir])))*-1.0*weights[dir] + _data_pdfs[_stride_pdfs_0*x + _stride_pdfs_1*y + _stride_pdfs_2*z + _stride_pdfs_3*dir]; + } +} +} + + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +#ifdef __CUDACC__ +#pragma pop +#endif + + +void UBBD3Q27::run_impl(IBlock * block, IndexVectors::Type type) +{ + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int32_t indexVectorSize = int32_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + + auto pointer = indexVectors->pointerCpu(type); + + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + uint8_t timestep = pdfs->getTimestep(); + auto & u_y = u_y_; + auto & u_x = u_x_; + auto & u_z = u_z_; + WALBERLA_ASSERT_GREATER_EQUAL(0, 
-int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(0, 0, 0, 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + if(((timestep & 1) ^ 1)) { + internal_ubbd3q27_even::ubbd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } else { + internal_ubbd3q27_even::ubbd3q27_even(_data_indexVector, _data_pdfs, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, indexVectorSize, u_x, u_y, u_z); + } +} + +void UBBD3Q27::run(IBlock * block) +{ + run_impl(block, IndexVectors::ALL); +} + +void UBBD3Q27::inner(IBlock * block) +{ + run_impl(block, IndexVectors::INNER); +} + +void UBBD3Q27::outer(IBlock * block) +{ + run_impl(block, IndexVectors::OUTER); +} + +} // namespace lbm +} // namespace walberla + diff --git a/src/lbm_generated/boundary/UBBD3Q27.h b/src/lbm_generated/boundary/UBBD3Q27.h new file mode 100644 index 0000000000000000000000000000000000000000..b7836d6958677e9b221f74f37b014b3de35019c7 --- /dev/null +++ b/src/lbm_generated/boundary/UBBD3Q27.h @@ -0,0 +1,647 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file UBBD3Q27.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once +#include "core/DataTypes.h" + +#include "field/GhostLayerField.h" +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "blockforest/StructuredBlockForest.h" +#include "field/FlagField.h" +#include "core/debug/Debug.h" + +#include <set> +#include <vector> + + + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +namespace walberla { +namespace lbm { + + +class UBBD3Q27 +{ +public: + struct IndexInfo { + int32_t x; + int32_t y; + int32_t z; + int32_t dir; + IndexInfo(int32_t x_, int32_t y_, int32_t z_, int32_t dir_) : x(x_), y(y_), z(z_), dir(dir_) {} + bool operator==(const IndexInfo & o) const { + return x == o.x && y == o.y && z == o.z && dir == o.dir; + } + }; + + + + class IndexVectors + { + public: + using CpuIndexVector = std::vector<IndexInfo>; + + enum Type { + ALL = 0, + INNER = 1, + OUTER = 2, + NUM_TYPES = 3 + }; + + IndexVectors() = default; + bool operator==(IndexVectors const &other) const { return other.cpuVectors_ == cpuVectors_; } + + CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; } + IndexInfo * pointerCpu(Type t) { return cpuVectors_[t].data(); } + + void syncGPU() + { + + } + + private: + std::vector<CpuIndexVector> cpuVectors_{NUM_TYPES}; + + + }; + + UBBD3Q27( const shared_ptr<StructuredBlockForest> & blocks, + BlockDataID pdfsID_, double u_x, double u_y, double u_z) + : pdfsID(pdfsID_), u_x_(u_x), u_y_(u_y), u_z_(u_z) + { + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = 
blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_UBBD3Q27"); + }; + + void run (IBlock * block); + + void operator() (IBlock * block) + { + run(block); + } + + void inner (IBlock * block); + + void outer (IBlock * block); + + std::function<void (IBlock *)> getSweep() + { + return [this] + (IBlock * b) + { this->run(b); }; + } + + std::function<void (IBlock *)> getInnerSweep() + { + return [this] + (IBlock * b) + { this->inner(b); }; + } + + std::function<void (IBlock *)> getOuterSweep() + { + return [this] + (IBlock * b) + { this->outer(b); }; + } + + template<typename FlagField_T> + void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID) + { + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + fillFromFlagField<FlagField_T>(&*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID ); + } + + + template<typename FlagField_T> + void fillFromFlagField(IBlock * block, ConstBlockDataID flagFieldID, + FlagUID boundaryFlagUID, FlagUID domainFlagUID ) + { + auto * indexVectors = block->getData< IndexVectors > ( indexVectorID ); + auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL); + auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER); + auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER); + + auto * flagField = block->getData< FlagField_T > ( flagFieldID ); + + + if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) )) + return; + + auto boundaryFlag = flagField->getFlag(boundaryFlagUID); + auto domainFlag = flagField->getFlag(domainFlagUID); + + auto inner = flagField->xyzSize(); + inner.expand( cell_idx_t(-1) ); + + indexVectorAll.clear(); + indexVectorInner.clear(); + indexVectorOuter.clear(); + + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + 
if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 0 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 1 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 2 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 3 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 4 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 5 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 6 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 7 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 8 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 9 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 0 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 10 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 11 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 12 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 13 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 14 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 15 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(0, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 16 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 17 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 0, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 18 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 19 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 20 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 21 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, 1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 22 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 23 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, 1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 24 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 25 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + for( auto it = flagField->beginWithGhostLayerXYZ( cell_idx_c( flagField->nrOfGhostLayers() - 1 ) ); it != flagField->end(); ++it ) + { + if( ! 
isFlagSet(it, domainFlag) ) + continue; + + if ( isFlagSet( it.neighbor(-1, -1, -1 , 0 ), boundaryFlag ) ) + { + auto element = IndexInfo(it.x(), it.y(), it.z(), 26 ); + + indexVectorAll.push_back( element ); + if( inner.contains( it.x(), it.y(), it.z() ) ) + indexVectorInner.push_back( element ); + else + indexVectorOuter.push_back( element ); + } + } + + + + + indexVectors->syncGPU(); + } + +private: + void run_impl(IBlock * block, IndexVectors::Type type); + + BlockDataID indexVectorID; + +public: + BlockDataID pdfsID; + double u_x_; + double u_y_; + double u_z_; +}; + + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/boundary/boundary_generation_script.py b/src/lbm_generated/boundary/boundary_generation_script.py new file mode 100644 index 0000000000000000000000000000000000000000..970daedc56e562b2aba05fa6e9147ad7c889cda0 --- /dev/null +++ b/src/lbm_generated/boundary/boundary_generation_script.py @@ -0,0 +1,55 @@ +import sympy as sp + +from pystencils import Target + +from lbmpy.creationfunctions import create_lb_method +from lbmpy import LBMConfig, Stencil, Method, LBStencil +from lbmpy.boundaries import ExtrapolationOutflow, FixedDensity, FreeSlip, NoSlip, UBB + +from pystencils_walberla import ManualCodeGenerationContext, generate_info_header +from lbmpy_walberla.boundary_collection import generate_boundary_collection +from lbmpy_walberla import lbm_boundary_generator + +with ManualCodeGenerationContext(openmp=False, optimize_for_localhost=False, + mpi=True, double_accuracy=True, cuda=False) as ctx: + + for stencil in [LBStencil(Stencil.D3Q19), LBStencil(Stencil.D3Q27)]: + target = Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + method = Method.SRT + relaxation_rate = sp.symbols("omega") + streaming_pattern = 'pull' + + lbm_config = LBMConfig(stencil=stencil, method=method, relaxation_rate=relaxation_rate, + streaming_pattern=streaming_pattern) 
+ + lb_method = create_lb_method(lbm_config=lbm_config) + + outflow_west_boundary = ExtrapolationOutflow(normal_direction=(1, 0, 0), lb_method=lb_method) + fixed_density_boundary = FixedDensity(density=sp.Symbol("density")) + free_slip_boundary = FreeSlip(stencil) + no_slip_boundary = NoSlip() + ubb_boundary = UBB(sp.symbols("u_x, u_y, u_z"), data_type=data_type) + + outflow = lbm_boundary_generator(class_name=f'Outflow{stencil.name}', flag_uid='Outflow', + boundary_object=outflow_west_boundary) + + fixed_density = lbm_boundary_generator(class_name=f'FixedDensity{stencil.name}', flag_uid='FixedDensity', + boundary_object=fixed_density_boundary) + + free_slip = lbm_boundary_generator(class_name=f'FreeSlip{stencil.name}', flag_uid='FreeSlip', + boundary_object=free_slip_boundary) + + no_slip = lbm_boundary_generator(class_name=f'NoSlip{stencil.name}', flag_uid='NoSlip', + boundary_object=no_slip_boundary) + + ubb = lbm_boundary_generator(class_name=f'UBB{stencil.name}', flag_uid='UBB', + boundary_object=ubb_boundary) + + boundaries = [outflow, fixed_density, free_slip, no_slip, ubb] + generate_boundary_collection(ctx, f'{stencil.name}BoundaryCollection', boundary_generators=boundaries, + lb_method=lb_method, streaming_pattern=streaming_pattern, + target=target) + + ctx.write_all_files() diff --git a/src/lbm_generated/communication/CMakeLists.txt b/src/lbm_generated/communication/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd5516b9e96a757f9d8911269d7f703b13e92105 --- /dev/null +++ b/src/lbm_generated/communication/CMakeLists.txt @@ -0,0 +1,9 @@ +target_sources( lbm_generated + PRIVATE + CombinedInPlacePackInfo.h + NonuniformCommData.h + NonuniformCommData.impl.h + NonuniformGeneratedPdfPackInfo.h + NonuniformGeneratedPdfPackInfo.impl.h + UniformGeneratedPdfPackInfo.h + ) diff --git a/src/lbm_generated/communication/CombinedInPlacePackInfo.h b/src/lbm_generated/communication/CombinedInPlacePackInfo.h new file mode 100644 index 
0000000000000000000000000000000000000000..b5a4c0ba2fd0fc6a9b2816ecbe38cff2af1dd150 --- /dev/null +++ b/src/lbm_generated/communication/CombinedInPlacePackInfo.h @@ -0,0 +1,117 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CombinedInPlacePackInfo.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once +#include "communication/UniformPackInfo.h" + +namespace walberla::lbm_generated { + +template< typename LatticeStorageSpecification_T, typename EvenPackInfo, typename OddPackInfo > +class CombinedInPlaceCpuPackInfo : public ::walberla::communication::UniformPackInfo +{ + public: + template< typename... Args > + CombinedInPlaceCpuPackInfo(std::shared_ptr< LatticeStorageSpecification_T >& storageSecification, Args&&... args) + : storageSecification_(storageSecification), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...) 
+ {} + + ~CombinedInPlaceCpuPackInfo() override = default; + bool constantDataExchange() const override { return true; } + bool threadsafeReceiving() const override { return true; } + + void unpackData(IBlock* receiver, stencil::Direction dir, mpi::RecvBuffer& buffer) override + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.unpackData(receiver, dir, buffer); + } + else + { + return oddPackInfo_.unpackData(receiver, dir, buffer); + } + } + + void communicateLocal(const IBlock* sender, IBlock* receiver, stencil::Direction dir) override + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.communicateLocal(sender, receiver, dir); + } + else + { + return oddPackInfo_.communicateLocal(sender, receiver, dir); + } + } + + void packDataImpl(const IBlock* sender, stencil::Direction dir, mpi::SendBuffer& outBuffer) const override + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.packDataImpl(sender, dir, outBuffer); + } + else + { + return oddPackInfo_.packDataImpl(sender, dir, outBuffer); + } + } + + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const + { + if (storageSecification_->isEvenTimeStep()) + { + evenPackInfo_.pack(dir, buffer, block); + } + else + { + oddPackInfo_.pack(dir, buffer, block); + } + } + + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const + { + if (storageSecification_->isEvenTimeStep()) + { + evenPackInfo_.unpack(dir, buffer, block); + } + else + { + oddPackInfo_.unpack(dir, buffer, block); + } + } + + uint_t size(stencil::Direction dir, IBlock* block) const + { + if (storageSecification_->isEvenTimeStep()) + { + return evenPackInfo_.size(dir, block); + } + else + { + return oddPackInfo_.size(dir, block); + } + } + + private: + const std::shared_ptr< LatticeStorageSpecification_T >& storageSecification_; + EvenPackInfo evenPackInfo_; + OddPackInfo oddPackInfo_; +}; + +} // namespace walberla::lbm_generated diff --git 
a/src/lbm_generated/communication/NonuniformCommData.h b/src/lbm_generated/communication/NonuniformCommData.h new file mode 100644 index 0000000000000000000000000000000000000000..762dde86c5cf2a8336e3791dd3b56274b5f26df3 --- /dev/null +++ b/src/lbm_generated/communication/NonuniformCommData.h @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformCommData.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" +#include "blockforest/BlockDataHandling.h" + +#include "domain_decomposition/IBlock.h" + +#include "field/FlagField.h" + +#include "lbm_generated/field/PdfField.h" + +#include "stencil/Directions.h" + +#define USE_CELL_INTERVALS + +namespace walberla::lbm_generated { + +using PartialCoalescenceMaskField = FlagField< uint32_t >; + +namespace util { + void forEachSubdirection(const Vector3< cell_idx_t > mainDirection, const std::function< void(Vector3< cell_idx_t >) >& func); + bool forEachSubdirectionCancel(const Vector3< cell_idx_t > mainDirection, + const std::function< bool(Vector3< cell_idx_t >) >& func); + void getSubdirections(const Vector3< cell_idx_t > mainDirection, std::vector< Vector3< cell_idx_t > > subdirs); + + template< typename Stencil_T > + void forEachOrthogonalDirection(Vector3<cell_idx_t> d, std::function< void(Vector3< cell_idx_t >) > func); +} // namespace util + +template< typename LatticeStorageSpecification_T > +class NonuniformCommData +{ + private: + void registerFlags(); + void computeBitMask(); + + public: + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + +#if defined(USE_CELL_INTERVALS) + NonuniformCommData(IBlock* const block, uint_t xSize, uint_t ySize, uint_t zSize) + : block_(block), maskField_(xSize, ySize, zSize, 2), + interiorInterval(0, 0, 0, cell_idx_c(xSize) - 1, cell_idx_c(ySize) - 1, cell_idx_c(zSize) - 1) + { + registerFlags(); + computeBitMask(); + }; +#else + NonuniformCommData(IBlock* const block, const BlockDataID pdfFieldID, uint_t xSize, uint_t ySize, uint_t zSize) + : block_(block), pdfFieldID_(pdfFieldID), maskField_(xSize, ySize, zSize, 2) + { + registerFlags(); + 
computeBitMask(); + }; +#endif + + bool operator==(const NonuniformCommData& other) { return this == &other; } + bool operator!=(const NonuniformCommData& other) { return this != &other; } + + PartialCoalescenceMaskField& getMaskField() { return maskField_; } + const PartialCoalescenceMaskField& getMaskField() const { return maskField_; } + + private: +#if defined(USE_CELL_INTERVALS) + void prepareIntervals(); + void setFlagOnInterval(const CellInterval & ci, const uint_t fIdx); +#else + void prepareFlags(); + void resetCornerSkippingOriginFlags(); +#endif + + void setupCornerSkippingOrigins(stencil::Direction commDir); + void setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir); + + bool haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir); + + const IBlock* const block_; + PartialCoalescenceMaskField maskField_; + +#if defined(USE_CELL_INTERVALS) + const CellInterval interiorInterval; + std::vector< CellInterval > passThroughIntervals_; + std::vector< CellInterval > cornerSkippingOriginIntervals_; +#endif +}; + + +template< typename LatticeStorageSpecification_T > +class NonuniformCommDataHandling + : public blockforest::AlwaysInitializeBlockDataHandling< NonuniformCommData< LatticeStorageSpecification_T > > +{ + public: + using CommmData_T = NonuniformCommData< LatticeStorageSpecification_T >; + + NonuniformCommDataHandling(const weak_ptr< StructuredBlockForest >& blocks) + : blocks_(blocks){}; + + CommmData_T* initialize(IBlock* const block) override + { + WALBERLA_ASSERT_NOT_NULLPTR(block) + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + return new CommmData_T(block, blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block), + blocks->getNumberOfZCells(*block)); + } + + private: + const weak_ptr< StructuredBlockStorage > blocks_; +}; + +} // walberla::lbm_generated + +#include "lbm_generated/communication/NonuniformCommData.impl.h" diff --git 
a/src/lbm_generated/communication/NonuniformCommData.impl.h b/src/lbm_generated/communication/NonuniformCommData.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..5a4bc3293087a5ed1e0c1aef261381511d908371 --- /dev/null +++ b/src/lbm_generated/communication/NonuniformCommData.impl.h @@ -0,0 +1,400 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformCommData.impl.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/all.h" + +#include "lbm_generated/communication/NonuniformCommData.h" + +#include "stencil/Directions.h" + +#define IDX_FLAG(d) (1 << d) + +#if !defined(USE_CELL_INTERVALS) +#define INTERIOR_FLAG_BIT 29 +#define INTERIOR_FLAG (1 << INTERIOR_FLAG_BIT) + +#define PASS_THROUGH_FLAG_BIT 30 +#define PASS_THROUGH_FLAG (1 << PASS_THROUGH_FLAG_BIT) + +#define CORNER_SKIPPING_ORIGIN_FLAG_BIT 31 +#define CORNER_SKIPPING_ORIGIN_FLAG (1 << CORNER_SKIPPING_ORIGIN_FLAG_BIT) +#endif + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { +namespace util { + +/*********************************************************************************************************************** + * Utility Functions for handling directions * + **********************************************************************************************************************/ + +/** + * Iterates all sub-directions of a given direction vector and runs a callback on each of them. + * Subdirections are any nonzero directions obtained by truncating zero or more components of a direction + * vector to zero. The direction vector itself is contained in this set. 
+ * @param mainDirection The direction whose subdirections will be iterated + * @param func The callback that should be run for each subdirection + */ +inline void forEachSubdirection(const Vector3< cell_idx_t > mainDirection, + const std::function< void(Vector3< cell_idx_t >) >& func) +{ + for (cell_idx_t z = std::min(0, mainDirection[2]); z <= std::max(0, mainDirection[2]); z++) + { + for (cell_idx_t y = std::min(0, mainDirection[1]); y <= std::max(0, mainDirection[1]); y++) + { + for (cell_idx_t x = std::min(0, mainDirection[0]); x <= std::max(0, mainDirection[0]); x++) + { + if (x == 0 && y == 0 && z == 0) continue; + func(Vector3< cell_idx_t >(x, y, z)); + } + } + } +} + +/** + * Iterates all sub-directions of a given direction vector and runs a callback on each of them. + * Subdirections are any nonzero directions obtained by truncating zero or more components of a direction + * vector to zero. The direction vector itself is contained in this set. + * @param mainDirection The direction whose subdirections will be iterated + * @param func The callback that should be run for each subdirection. If the callback returns false, the + * iteration will be stopped. 
+ * @return true if the iteration completed, false if it was canceled + */ +inline bool forEachSubdirectionCancel(const Vector3< cell_idx_t > mainDirection, + const std::function< bool(Vector3< cell_idx_t >) >& func) +{ + for (cell_idx_t z = std::min(0, mainDirection[2]); z <= std::max(0, mainDirection[2]); z++) + { + for (cell_idx_t y = std::min(0, mainDirection[1]); y <= std::max(0, mainDirection[1]); y++) + { + for (cell_idx_t x = std::min(0, mainDirection[0]); x <= std::max(0, mainDirection[0]); x++) + { + if (x == 0 && y == 0 && z == 0) continue; + if (!func(Vector3< cell_idx_t >(x, y, z))) return false; + } + } + } + + return true; +} + +inline void getSubdirections(const Vector3< cell_idx_t > mainDirection, + std::vector< Vector3< cell_idx_t > > subdirections) +{ + forEachSubdirection(mainDirection, [&](Vector3< cell_idx_t > v) { subdirections.push_back(v); }); +} + +/** + * Iterates all directions orthogonal to d that are part of the given stencil, and executes a function on + * each of them. + * @tparam Stencil_T The underlying stencil + * @param d + * @param func + */ +template< typename Stencil_T > +inline void forEachOrthogonalDirection(Vector3< cell_idx_t > d, std::function< void(Vector3< cell_idx_t >) > func) +{ + for (cell_idx_t x = (d[0] == 0 ? -1 : 0); x <= (d[0] == 0 ? 1 : 0); x++) + for (cell_idx_t y = (d[1] == 0 ? -1 : 0); y <= (d[1] == 0 ? 1 : 0); y++) + for (cell_idx_t z = (d[2] == 0 ? -1 : 0); z <= (d[2] == 0 ? 
1 : 0); z++) + { + if (x == 0 && y == 0 && z == 0) continue; + if (Stencil_T::containsDir(stencil::vectorToDirection(x, y, z))) { func(Vector3(x, y, z)); } + } +} + +} // namespace util + +/*********************************************************************************************************************** + * Bit Mask Computation * + **********************************************************************************************************************/ + +template< typename LatticeStorageSpecification_T > +void NonuniformCommData< LatticeStorageSpecification_T >::registerFlags() +{ +#if !defined(USE_CELL_INTERVALS) + maskField_.registerFlag(FlagUID(true), INTERIOR_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), PASS_THROUGH_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), CORNER_SKIPPING_ORIGIN_FLAG_BIT); +#endif + + for(auto it = Stencil::beginNoCenter(); it != Stencil::end(); ++it){ + maskField_.registerFlag(FlagUID(true), Stencil::idx[*it]); + } +} + +#if defined(USE_CELL_INTERVALS) + +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::prepareIntervals() +{ + passThroughIntervals_.clear(); + const Block * b = dynamic_cast< const Block * >(block_); + + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + CellInterval ci; + maskField_.getGhostRegion(*commDir, ci, 2); + passThroughIntervals_.push_back(ci); + } + } +} + +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::setFlagOnInterval(const CellInterval & ci, + const uint_t fIdx) +{ + for(auto c : ci){ + maskField_.addFlag(c, IDX_FLAG(fIdx)); + } +} + +#else + +/** + * Prepares the INTERIOR and PASS_THROUGH flags. + * Sets the domain interior to INTERIOR. 
Sets any ghost layers corresponding to a coarse block + * or no block to PASS_THROUGH. + */ +template< typename LatticeStorageSpecification_T > +void NonuniformCommData< LatticeStorageSpecification_T >::prepareFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Set interior to origin + for (auto it = maskField_.beginXYZ(); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), INTERIOR_FLAG); + } + + // Set GLs to pass-through + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.addFlag(it.cell(), PASS_THROUGH_FLAG); + } + } + } +} + +/** + * Resets the origin flag on any ghost layers. + */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::resetCornerSkippingOriginFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Remove origin flag from any ghost layers + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.removeFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } + } + } +} + +#endif + + +/** + * Determines whether the current block has the smallest BlockID among all fine blocks of a + * given intersection volume. 
+ * @tparam LatticeStorageSpecification_T + * @param cornerDir + * @return + */ +template< typename LatticeStorageSpecification_T > +inline bool NonuniformCommData< LatticeStorageSpecification_T >::haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir) +{ + const IBlockID& myId = block_->getId(); + const Block* b = dynamic_cast< const Block* >(block_); + return forEachSubdirectionCancel(cornerDir, [&](Vector3< cell_idx_t > dirVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + if (b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + if (b->getNeighbor(nSecIdx, 0).getId() < myId) return false; + } + return true; + }); +} + + +/** + * Sets up the feasible space for the given communication direction. + * Additionally to the field interior, marks every ghost layer slice corresponding to an adjacent coarse block, + * and the corresponding corner as feasible, if that corner also belongs to a coarse block and the current block + * has the smallest BlockID participating in the intersection. 
+ * @param commDir A communication direction pointing toward an adjacent coarse block + */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::setupCornerSkippingOrigins(stencil::Direction commDir) +{ +#if defined(USE_CELL_INTERVALS) + cornerSkippingOriginIntervals_.clear(); +#else + resetCornerSkippingOriginFlags(); +#endif + + const Block* b = dynamic_cast< const Block* >(block_); + Vector3<cell_idx_t> commDirVec(stencil::cx[commDir], stencil::cy[commDir], stencil::cz[commDir]); + + // Iterate all orthogonal comm directions + forEachOrthogonalDirection< CommunicationStencil >(commDirVec, [&](Vector3< cell_idx_t > toSourceVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(toSourceVec[0], toSourceVec[1], toSourceVec[2]); + // Find if there is a coarse block or no block at all in this neighborhood + // There are three possibilities: Coarse block, Same-level block or no block + // Finer block is not possible because of 2:1 balance + if (!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + // From this adjacent coarse block (or not-block, for boundary handling), corner skipping must be handled. + // Also, if there is no block, boundary handling in that region must be done on only + // one of the participating fine blocks. + Vector3< cell_idx_t > cornerDirVec = toSourceVec + commDirVec; + + // If the current block has the smallest participating ID... + if (haveSmallestIdInIntersection(cornerDirVec)) + { + const stencil::Direction toSourceDir = stencil::vectorToDirection(toSourceVec); + + // ... Mark source GL region as corner skipping origin. 
+#if defined(USE_CELL_INTERVALS) + CellInterval ci; + maskField_.getGhostRegion(toSourceDir, ci, 2); + cornerSkippingOriginIntervals_.push_back(ci); +#else + for (auto it = maskField_.beginGhostLayerOnlyXYZ(toSourceDir); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } +#endif + } + } + }); +} + + +template< typename LatticeStorageSpecification_T > +inline void NonuniformCommData< LatticeStorageSpecification_T >::setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir) +{ + uint_t fIdx = Stencil::idx[streamDir]; + Cell streamVec(stencil::cx[streamDir], stencil::cy[streamDir], stencil::cz[streamDir]); + +#if defined(USE_CELL_INTERVALS) + CellInterval commSliceInterval; + maskField_.getGhostRegion(commDir, commSliceInterval, 2); + + // Shift back once + commSliceInterval.shift(-streamVec); + + // Intersect with interior and set flag on intersection volume + CellInterval interiorIntersection(interiorInterval); + interiorIntersection.intersect(commSliceInterval); + if(!interiorIntersection.empty()){ + interiorIntersection.shift(streamVec); + setFlagOnInterval(interiorIntersection, fIdx); + } + + // Intersect with pass-through regions... + for(auto passThroughIntersection : std::as_const(passThroughIntervals_)){ + passThroughIntersection.intersect(commSliceInterval); + if(passThroughIntersection.empty()) continue; + + // ... shift back once more ... + passThroughIntersection.shift(-streamVec); + + // ... intersect with interior ... + interiorIntersection = interiorInterval; + interiorIntersection.intersect(passThroughIntersection); + if(!interiorIntersection.empty()){ + interiorIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z()); + setFlagOnInterval(interiorIntersection, fIdx); + } + + // ... 
and with corner-skipping origin regions + for(auto originIntersection : std::as_const(cornerSkippingOriginIntervals_)){ + originIntersection.intersect(passThroughIntersection); + if(!originIntersection.empty()){ + originIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z()); + setFlagOnInterval(originIntersection, fIdx); + } + } + } +#else + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, commDir); it != maskField_.end(); ++it){ + Cell currentCell = it.cell(); + + // Shift back once + Cell shiftedCell = currentCell - streamVec; + + if (maskField_.isFlagSet(shiftedCell, INTERIOR_FLAG)){ + maskField_.addFlag(currentCell, IDX_FLAG(fIdx)); + } + else if (maskField_.isFlagSet(shiftedCell, PASS_THROUGH_FLAG)){ + // Shift back twice + shiftedCell -= streamVec; + if (maskField_.isPartOfMaskSet(shiftedCell, INTERIOR_FLAG | CORNER_SKIPPING_ORIGIN_FLAG)){ + maskField_.addFlag(currentCell, IDX_FLAG(fIdx)); + } + + } + // else continue; + } +#endif +} + +/** + * Computes the partial coalescence bit mask on the mask field. + * Assumes that all flags are already registered at the field, and that the field + * has been initialized to zero. 
+ */ +template< typename LatticeStorageSpecification_T > +void NonuniformCommData< LatticeStorageSpecification_T >::computeBitMask() +{ +#if defined(USE_CELL_INTERVALS) + prepareIntervals(); +#else + prepareFlags(); +#endif + + const Block* b = dynamic_cast< const Block* >(block_); + for(auto commIt = CommunicationStencil::beginNoCenter(); commIt != CommunicationStencil::end(); ++commIt){ + stencil::Direction commDir = *commIt; + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(commDir); + if(b->neighborhoodSectionHasLargerBlock(nSecIdx)){ + setupCornerSkippingOrigins(commDir); + + for(uint_t streamDirIdx = 0; streamDirIdx < Stencil::d_per_d_length[commDir]; streamDirIdx++){ + stencil::Direction streamDir = Stencil::d_per_d[commDir][streamDirIdx]; + setupBitMaskSlice(commDir, streamDir); + } + } + } +} + +} // walberla::lbm_generated diff --git a/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..1b3e43a51dd7e7e8965e2152c58e493f73d8af84 --- /dev/null +++ b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h @@ -0,0 +1,317 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedPdfPackInfo.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/communication/NonUniformPackInfo.h" + +#include "core/DataTypes.h" +#include "core/mpi/RecvBuffer.h" +#include "core/mpi/SendBuffer.h" + +#include "lbm_generated/communication/NonuniformCommData.h" +#include "lbm_generated/field/PdfField.h" + +namespace walberla::lbm_generated { +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. + */ +template< typename PdfField_T, bool inplace > +class NonuniformPackingKernelsWrapper +{ + public: + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir) const = 0; + + void unpackRedistribute(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, + stencil::Direction dir) const = 0; + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskField* maskField, CellInterval& ci, + unsigned char* outBuffer, Direction dir) const = 0; + 
void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval& ci, Direction dir) const = 0; + void unpackCoalescence(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const = 0; + + uint_t size(CellInterval& ci, Direction dir) const = 0; + uint_t size(CellInterval& ci) const = 0; + uint_t redistributeSize(CellInterval& ci) const = 0; + uint_t partialCoalescenceSize(CellInterval& ci, Direction dir) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. + */ +template< typename PdfField_T > +class NonuniformPackingKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const + { + kernels_.packAll(srcField, ci, outBuffer); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const + { + kernels_.unpackAll(dstField, ci, inBuffer); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir); + } + + void unpackRedistribute(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, + 
stencil::Direction dir) const + { + kernels_.unpackRedistribute(dstField, ci, inBuffer, dir); + } + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskField* maskField, CellInterval& ci, + unsigned char* outBuffer, Direction dir) const + { + kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir); + } + + void unpackCoalescence(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const + { + kernels_.unpackCoalescence(dstField, ci, inBuffer, dir); + } + + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval& ci, Direction dir) const + { + kernels_.zeroCoalescenceRegion(dstField, ci, dir); + } + + uint_t size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + uint_t redistributeSize(CellInterval& ci) const { return kernels_.redistributeSize(ci); } + uint_t partialCoalescenceSize(CellInterval& ci, Direction dir) const + { + return kernels_.partialCoalescenceSize(ci, dir); + } + + private: + PackingKernels_T kernels_; +}; + +/* + * Template Specialization for in-place patterns, extracting the timestep from the lattice model. 
+ */ +template< typename PdfField_T > +class NonuniformPackingKernelsWrapper< PdfField_T, true > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packAll(srcField, ci, outBuffer, timestep); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackAll(dstField, ci, inBuffer, timestep); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packDirection(srcField, ci, outBuffer, dir, timestep); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep); + } + + void unpackRedistribute(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, + stencil::Direction dir) const + { + uint8_t timestep = dstField->getTimestep(); + 
kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, timestep); + } + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskField* maskField, CellInterval& ci, + unsigned char* outBuffer, Direction dir) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, timestep); + } + + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval& ci, Direction dir) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.zeroCoalescenceRegion(dstField, ci, dir, timestep); + } + + void unpackCoalescence(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackCoalescence(dstField, ci, inBuffer, dir, timestep); + } + + uint_t size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + uint_t redistributeSize(CellInterval& ci) const { return kernels_.redistributeSize(ci); } + uint_t partialCoalescenceSize(CellInterval& ci, Direction dir) const + { + return kernels_.partialCoalescenceSize(ci, dir); + } + + private: + PackingKernels_T kernels_; +}; +} // namespace internal + +/*********************************************************************************************************************** + * Class Declaration * + **********************************************************************************************************************/ + +template< typename PdfField_T > +class NonuniformGeneratedPdfPackInfo : public blockforest::communication::NonUniformPackInfo +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename 
LatticeStorageSpecification_T::CommunicationStencil; + using CommData_T = NonuniformCommData< LatticeStorageSpecification_T >; + + + NonuniformGeneratedPdfPackInfo(const BlockDataID pdfFieldID, const BlockDataID commDataID) + : pdfFieldID_(pdfFieldID), commDataID_(commDataID){}; + + bool constantDataExchange() const override { return true; }; + bool threadsafeReceiving() const override { return false; }; + + /// Equal Level + void unpackDataEqualLevel(Block* receiver, Direction dir, mpi::RecvBuffer& buffer) override; + void communicateLocalEqualLevel(const Block* sender, Block* receiver, stencil::Direction dir) override; + + /// Coarse to Fine + void unpackDataCoarseToFine(Block* fineReceiver, const BlockID& coarseSender, stencil::Direction dir, + mpi::RecvBuffer& buffer) override; + void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) override; + + /// Fine to Coarse + void prepareCoalescence(Block* coarseReceiver); + void unpackDataFineToCoarse(Block* coarseReceiver, const BlockID& fineSender, stencil::Direction dir, + mpi::RecvBuffer& buffer) override; + + void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir) override; + + protected: + void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, mpi::SendBuffer& buffer) const override; + + void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, + mpi::SendBuffer& buffer) const override; + void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, + mpi::SendBuffer& buffer) const override; + + private: + /// Helper Functions + /// As in PdfFieldPackInfo.h + Vector3< cell_idx_t > getNeighborShift(const BlockID& fineBlock, stencil::Direction dir) const; + bool areNeighborsInDirection(const Block * block, const BlockID & neighborID, const Vector3< cell_idx_t> dirVec) const; + + CellInterval 
intervalHullInDirection(const CellInterval& ci, const Vector3< cell_idx_t > tangentialDir, + cell_idx_t width) const; + bool skipsThroughCoarseBlock(const Block* block, const Direction dir) const; + + void getCoarseBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const; + void getFineBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const; + + CellInterval getCoarseBlockCoalescenceInterval(const Block * coarseBlock, const BlockID & fineBlockID, + Direction dir, const PdfField_T * field) const; + + const BlockDataID pdfFieldID_; + internal::NonuniformPackingKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_; + + public: + const BlockDataID commDataID_; +}; + +/*********************************************************************************************************************** + * Factory Functions * + **********************************************************************************************************************/ + +template< typename PdfField_T> +std::shared_ptr< NonuniformGeneratedPdfPackInfo< PdfField_T > > + setupNonuniformPdfCommunication(const std::weak_ptr< StructuredBlockForest >& blocks, const BlockDataID pdfFieldID, + const std::string& dataIdentifier = "NonuniformCommData"); + +} // walberla::lbm_generated + +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h" diff --git a/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..cf36a61f9813989b5e975e6782f5c3ea138a3e96 --- /dev/null +++ b/src/lbm_generated/communication/NonuniformGeneratedPdfPackInfo.impl.h @@ -0,0 +1,490 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedPdfPackInfo.impl.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "NonuniformGeneratedPdfPackInfo.h" + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { + +/*********************************************************************************************************************** + * Factory Functions * + **********************************************************************************************************************/ + + +/** + * Sets up a NonuniformGeneratedPdfPackInfo. 
+ * + * @tparam LatticeStorageSpecification_T + * @tparam PackingKernels_T + * @param blocks + * @param pdfFieldID + * @param dataIdentifier + * @return + */ +template< typename PdfField_T> +std::shared_ptr< NonuniformGeneratedPdfPackInfo< PdfField_T > > +setupNonuniformPdfCommunication( const std::weak_ptr< StructuredBlockForest > & blocks, + const BlockDataID pdfFieldID, + const std::string & dataIdentifier) +{ + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + + auto sbf = blocks.lock(); + WALBERLA_CHECK_NOT_NULLPTR(sbf) + + auto handling = std::make_shared<NonuniformCommDataHandling< LatticeStorageSpecification_T > >(blocks); + BlockDataID commDataID = sbf->addBlockData(handling, dataIdentifier); + + return std::make_shared<NonuniformGeneratedPdfPackInfo< PdfField_T > >(pdfFieldID, commDataID); +} + + +/*********************************************************************************************************************** + * Equal Level Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block* receiver, + Direction dir, + mpi::RecvBuffer& buffer) +{ + auto field = receiver->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(receiver, dir) ? 
2 : 1; + field->getGhostRegion(dir, ci, gls, false); + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackDirection(field, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::communicateLocalEqualLevel( + const Block* sender, Block* receiver, stencil::Direction dir) +{ + auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 2 : 1; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( + const Block* sender, stencil::Direction dir, mpi::SendBuffer& buffer) const +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + field->getSliceBeforeGhostLayer(dir, ci, gls, false); + unsigned char* bufferPtr = buffer.forward(kernels_.size(ci, dir)); + kernels_.packDirection(field, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Coarse to Fine Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl( + const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, mpi::SendBuffer& buffer) const +{ + auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getCoarseBlockCommIntervals(fineReceiver, dir, field, intervals); + + for (auto t : intervals) + { + CellInterval ci = t.second; + unsigned char* bufferPtr = buffer.forward(kernels_.size(ci)); + kernels_.packAll(field, ci, bufferPtr); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::unpackDataCoarseToFine( + Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, mpi::RecvBuffer& buffer) +{ + auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getFineBlockCommIntervals(fineReceiver->getId(), dir, field, intervals); + + for (auto t : intervals) + { + Direction d = t.first; + CellInterval ci = t.second; + uint_t size = kernels_.redistributeSize(ci); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackRedistribute(field, ci, bufferPtr, d); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) +{ + auto srcField = const_cast< Block* 
>(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! 
+ std::vector< unsigned char > buffer(packSize); + kernels_.packAll(srcField, srcInterval, &buffer[0]); + kernels_.unpackRedistribute(dstField, dstInterval, &buffer[0], unpackDir); + } +} + +/*********************************************************************************************************************** + * Fine to Coarse Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedPdfPackInfo< PdfField_T >::prepareCoalescence(Block* coarseReceiver) +{ + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + if(coarseReceiver->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + dstField->getSliceBeforeGhostLayer(*it, ci, 1); + kernels_.zeroCoalescenceRegion(dstField, ci, *it); + } + } +} + +template< typename PdfField_T> +void walberla::lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >::unpackDataFineToCoarse( + walberla::Block* coarseReceiver, const walberla::BlockID& fineSender, walberla::stencil::Direction dir, + walberla::mpi::RecvBuffer& buffer) +{ + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + CellInterval ci = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender, dir, dstField); + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.skip(size); + kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void walberla::lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( + const walberla::Block* fineSender, walberla::Block* coarseReceiver, walberla::stencil::Direction dir) +{ + Block * varFineSender = const_cast< Block * >(fineSender); + auto srcField = varFineSender->getData< 
PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskField * maskField = &(srcCommData->getMaskField()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! + std::vector< unsigned char > buffer(packSize); + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, &buffer[0], dir); + kernels_.unpackCoalescence(dstField, dstInterval, &buffer[0], invDir); +} + +template< typename PdfField_T> +void walberla::lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl( + const walberla::Block* fineSender, const walberla::BlockID& /*coarseReceiver*/, walberla::stencil::Direction dir, + walberla::mpi::SendBuffer& buffer) const +{ + Block* varBlock = const_cast< Block* >(fineSender); + auto srcField = varBlock->getData< PdfField_T >(pdfFieldID_); + auto commData = varBlock->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskField * maskField = &(commData->getMaskField()); + + CellInterval ci; + srcField->getGhostRegion(dir, ci, 2); + uint_t size = kernels_.partialCoalescenceSize(ci, dir); + unsigned char* bufferPtr = buffer.forward(size); + kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Helper Functions * + 
 **********************************************************************************************************************/

/**
 * Computes, per axis, on which side of its parent coarse block the given fine block lies:
 * -1 for the lower half, +1 for the upper half, and 0 on axes parallel to the communication
 * direction (and on z in 2D). The branch id encodes the child octant: bit 0 -> x, bit 1 -> y,
 * bit 2 -> z (bit cleared = lower half).
 */
template< typename PdfField_T>
inline Vector3< cell_idx_t >
NonuniformGeneratedPdfPackInfo< PdfField_T >::getNeighborShift(const BlockID& fineBlock,
                                                               stencil::Direction dir) const
{
   //  dir: direction from coarse to fine block, or vice versa
   Vector3< cell_idx_t > shift;

   uint_t const branchId = fineBlock.getBranchId();

   shift[0] = (stencil::cx[dir] == 0) ? (((branchId & uint_t(1)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) :
              cell_idx_t(0);
   shift[1] = (stencil::cy[dir] == 0) ? (((branchId & uint_t(2)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) :
              cell_idx_t(0);
   shift[2] = (Stencil::D == uint_t(3)) ?
              ((stencil::cz[dir] == 0) ? (((branchId & uint_t(4)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) :
               cell_idx_t(0)) :
              cell_idx_t(0);

   return shift;
}

/**
 * Returns the part of a cell interval's hull of given width in direction dirVec.
 * @param ci The original cell interval
 * @param dirVec Direction Vector
 * @param width Width of the hull
 * @return Interval forming the part of the hull
 */
template< typename PdfField_T>
inline CellInterval NonuniformGeneratedPdfPackInfo< PdfField_T >::intervalHullInDirection(
   const CellInterval& ci, const Vector3< cell_idx_t > dirVec, cell_idx_t width) const
{
   CellInterval result(ci);
   for (uint_t i = 0; i < Stencil::D; i++)
   {
      if (dirVec[i] == 1)
      {
         // Hull slab on the upper side: starts one cell past ci's max
         result.min()[i] = result.max()[i] + cell_idx_t(1);
         result.max()[i] += width;
      }
      if (dirVec[i] == -1)
      {
         // Hull slab on the lower side: ends one cell before ci's min
         result.max()[i] = result.min()[i] - cell_idx_t(1);
         result.min()[i] -= width;
      }
   }

   return result;
}

/**
 * For edge or corner directions, checks if a coarser block is part of the respective edge or corner intersection.
 * @param block The local block
 * @param dir The direction to check
 * @return `true` if dir is an edge or corner direction skipping through a coarser block.
 */
template< typename PdfField_T>
inline bool NonuniformGeneratedPdfPackInfo< PdfField_T >::skipsThroughCoarseBlock(
   const Block* block, const Direction dir) const
{
   Vector3< cell_idx_t > dirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]);
   bool coarseBlockFound = false;
   // Scan all subdirections of dir for a larger (coarser) neighbor block; returning false
   // from the callback cancels the iteration as soon as one is found.
   forEachSubdirectionCancel(dirVec, [&](Vector3< cell_idx_t > subdir) {
      coarseBlockFound =
         coarseBlockFound || block->neighborhoodSectionHasLargerBlock(
                                blockforest::getBlockNeighborhoodSectionIndex(subdir[0], subdir[1], subdir[2]));
      return !coarseBlockFound;
   });

   return coarseBlockFound;
}

/**
 * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval)
 * mapping sub-directions of the communication direction to cell intervals on the coarse block interior
 * whose data must be communicated <i>as if</i> communicating in those sub-directions.
 * @param fineBlockID ID of the fine block
 * @param dir Direction from the coarse to the fine block
 * @param field Pointer to the PDF field on the coarse block
 * @param intervals Vector that will be filled with the computed intervals
 */
template< typename PdfField_T>
inline void NonuniformGeneratedPdfPackInfo< PdfField_T >::getCoarseBlockCommIntervals(
   const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
   std::vector< std::pair< Direction, CellInterval > >& intervals) const
{
   Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir);

   CellInterval mainSlice;
   field->getSliceBeforeGhostLayer(dir, mainSlice, 1, false);

   // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift
   for (uint_t i = 0; i != Stencil::D; ++i)
   {
      if (shift[i] == cell_idx_t(-1))
      {
         // The slice extent must be even so it can be halved exactly
         WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0)
         mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1);
      }
      if (shift[i] == cell_idx_t(1))
      {
         // The slice extent must be even so it can be halved exactly
         WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0)
         mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2));
      }
   }

   intervals.emplace_back(dir, mainSlice);

   Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] };

   // Get extended slices in all tangential directions for the diagonal part of communication
   forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) {
      CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(1));
      Direction subCommDir = stencil::vectorToDirection(commDirVec - t);
      if(CommunicationStencil::containsDir(subCommDir)){
         intervals.emplace_back(subCommDir, hullInterval);
      }
   });
}

/**
 * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval)
 * mapping sub-directions of the communication direction to cell intervals on the fine block whose data must
 * be communicated <i>as if</i> communicating in those sub-directions.
 * @param fineBlockID ID of the fine block
 * @param dir Direction from the fine to the coarse block
 * @param field Pointer to the PDF Field on the fine block
 * @param intervals Vector that will be filled with the computed intervals
 */
template< typename PdfField_T>
inline void NonuniformGeneratedPdfPackInfo< PdfField_T >::getFineBlockCommIntervals(
   const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
   std::vector< std::pair< Direction, CellInterval > >& intervals) const
{
   Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir);

   CellInterval mainSlice;
   // Two ghost layers are used on the fine side
   field->getGhostRegion(dir, mainSlice, 2, false);
   intervals.emplace_back(dir, mainSlice);

   Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] };

   forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) {
      CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(2));
      Direction subCommDir = stencil::vectorToDirection(commDirVec + t);
      if(CommunicationStencil::containsDir(subCommDir)){
         intervals.emplace_back(subCommDir, hullInterval);
      }
   });
}
/**
 * Checks whether or not the block with ID `neighborID` is a neighbor of `block` in direction `dir`.
 */
template< typename PdfField_T>
bool NonuniformGeneratedPdfPackInfo< PdfField_T >::areNeighborsInDirection(
   const Block* block, const BlockID& neighborID, const Vector3< cell_idx_t> dirVec) const
{
   uint_t const nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]);
   uint_t const nSecSize = block->getNeighborhoodSectionSize(nSecIdx);

   // Linear scan over the neighborhood section
   for(uint_t i = 0; i < nSecSize; i++){
      if(block->getNeighborId(nSecIdx, i) == neighborID){
         return true;
      }
   }
   return false;
}

/**
 * Determines the interval on the coarse block into which data coalescing from the given fine
 * block in direction dir is written. Also handles the asymmetric case where the fine block is
 * not a direct neighbor in dir (e.g. due to periodicity).
 */
template< typename PdfField_T>
CellInterval NonuniformGeneratedPdfPackInfo< PdfField_T >::getCoarseBlockCoalescenceInterval(
   const Block* coarseBlock, const BlockID& fineBlockID, Direction dir, const PdfField_T* field) const
{
   Direction mainDir(dir);
   Vector3< cell_idx_t > commDirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]);
   Vector3< cell_idx_t > mainDirVec(commDirVec);
   bool isAsymmetric = !areNeighborsInDirection(coarseBlock, fineBlockID, commDirVec);

   // If asymmetric, find the main subdirection
   if(isAsymmetric){
      mainDirVec = Vector3< cell_idx_t >(0);
      forEachSubdirection(commDirVec, [&](Vector3< cell_idx_t > subdirVec){
         if(areNeighborsInDirection(coarseBlock, fineBlockID, subdirVec)){
            // -dir is one main communication direction from F to C, but, due to periodicity,
            // it might not be the only one. Find the main comm direction from the subdirections
            // that is largest in the 1-norm.
            // (for vectors with components in {-1,0,1}, sqrLength equals the 1-norm)
            if(subdirVec.sqrLength() > mainDirVec.sqrLength()) mainDirVec = subdirVec;
         }
      });
      mainDir = stencil::vectorToDirection(mainDirVec);
   }

   Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, mainDir);

   CellInterval mainSlice;
   field->getSliceBeforeGhostLayer(mainDir, mainSlice, 1, false);

   // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift
   for (uint_t i = 0; i != Stencil::D; ++i)
   {
      if (shift[i] == cell_idx_t(-1))
      {
         WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0)
         mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1);
      }
      if (shift[i] == cell_idx_t(1))
      {
         WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0)
         mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2));
      }
   }

   CellInterval commSlice(mainSlice);

   // If asymmetric, find coalescence slice as hull of main slice
   if(isAsymmetric){
      commSlice = intervalHullInDirection(mainSlice, mainDirVec - commDirVec, 1);
   }

   return commSlice;
}

} // walberla::lbm_generated
diff --git a/src/lbm_generated/communication/UniformGeneratedPdfPackInfo.h b/src/lbm_generated/communication/UniformGeneratedPdfPackInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..76d28617a2b7f7be888eb1ed84ecb945a23bc229
--- /dev/null
+++ b/src/lbm_generated/communication/UniformGeneratedPdfPackInfo.h
@@ -0,0 +1,291 @@
//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGeneratedPdfPackInfo.h +//! \ingroup lbm +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \brief Class Template for Lattice Boltzmann PDF Pack Infos using code-generated kernels +// +//====================================================================================================================== + +#pragma once + +#include "communication/UniformPackInfo.h" + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + +#include "lbm/field/PdfField.h" + +#include "stencil/Directions.h" + +namespace walberla +{ +using communication::UniformPackInfo; + +namespace lbm_generated +{ +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. 
 */
template< typename PdfField_T, bool inplace >
class UniformPackingKernelsWrapper
{
 public:
   // NOTE(review): pure-specifiers on non-virtual members document the interface every
   // specialization must provide; presumably the primary template is never instantiated — confirm.

   void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const = 0;
   void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const = 0;
   void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                     CellInterval& dstInterval) const = 0;

   void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, const Direction dir) const = 0;
   void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, const Direction dir) const = 0;
   void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                           CellInterval& dstInterval, const Direction dir) const = 0;

   uint_t size(CellInterval& ci, const Direction dir) const = 0;
   uint_t size(CellInterval& ci) const = 0;
};

/*
 * Template Specialization for two-fields patterns, with trivial method wrappers.
 */
template< typename PdfField_T >
class UniformPackingKernelsWrapper< PdfField_T, false >
{
 public:
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;

   void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const
   {
      kernels_.packAll(srcField, ci, outBuffer);
   }

   void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const
   {
      kernels_.unpackAll(dstField, ci, inBuffer);
   }

   void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                     CellInterval& dstInterval) const
   {
      kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval);
   }

   void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, const Direction dir) const
   {
      kernels_.packDirection(srcField, ci, outBuffer, dir);
   }

   void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, const Direction dir) const
   {
      kernels_.unpackDirection(dstField, ci, inBuffer, dir);
   }

   void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                           CellInterval& dstInterval, const Direction dir) const
   {
      kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir);
   }

   uint_t size(CellInterval& ci, const Direction dir) const { return kernels_.size(ci, dir); }
   uint_t size(CellInterval& ci) const { return kernels_.size(ci); }

 private:
   PackingKernels_T kernels_;
};

/*
 * Template Specialization for in-place patterns, extracting the timestep from the lattice model.
 */
template< typename PdfField_T >
class UniformPackingKernelsWrapper< PdfField_T, true >
{
 public:
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;

   void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer) const
   {
      // In-place streaming: the generated kernels need the field's current timestep parity
      uint8_t timestep = srcField->getTimestep();
      kernels_.packAll(srcField, ci, outBuffer, timestep);
   }

   void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackAll(dstField, ci, inBuffer, timestep);
   }

   void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                     CellInterval& dstInterval) const
   {
      // Local copy requires both fields to be at the same timestep
      uint8_t timestep = srcField->getTimestep();
      WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep())
      kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep);
   }

   void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, const Direction dir) const
   {
      uint8_t timestep = srcField->getTimestep();
      kernels_.packDirection(srcField, ci, outBuffer, dir, timestep);
   }

   void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, const Direction dir) const
   {
      uint8_t timestep = dstField->getTimestep();
      kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep);
   }

   void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField,
                           CellInterval& dstInterval, const Direction dir) const
   {
      uint8_t timestep = srcField->getTimestep();
      WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep())
      kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep);
   }

   uint_t size(CellInterval& ci, const Direction dir) const { return kernels_.size(ci, dir); }
   uint_t size(CellInterval& ci) const { return kernels_.size(ci); }

 private:
   PackingKernels_T kernels_;
};
} // namespace internal

/**
 * Pack Info class template for lattice Boltzmann PDF fields. Relies on a code-generated
 * class providing kernel implementations for packing, unpacking and local copying of data.
 *
 * This template relies on a PackingKernels implementation generated by lbmpy_walberla.packing_kernels.
 * The code generated part provides the kernels for transferring data between communication buffers
 * and fields. The iteration slices are constructed by this class.
 *
 * The code-generated substructure enables the usage of arbitrary, in particular in-place streaming
 * patterns.
 *
 * @tparam PdfField_T Type of the PDF field whose storage specification provides the PackingKernels
 *                    implementation generated using `lbmpy_walberla.generate_packing_kernels`.
 *
 * \ingroup lbm
 */
template< typename PdfField_T >
class UniformGeneratedPdfPackInfo : public UniformPackInfo
{
 public:
   using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
   using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels;
   using Stencil = typename LatticeStorageSpecification_T::Stencil;

   /**
    * Constructor.
    *
    * @param pdfFieldID ID of the associated walberla::lbm::PdfField
    * @param cellLayersToSend The amount of cell layers that should be communicated
    * @param sendAll If true, instead of only those populations streaming in subdirections of the communication
    *                direction, all populations will always be communicated.
    *                \warning Be careful when using this option with any streaming pattern other than
    *                the pull pattern. Other patterns store at least some of their post-collision
    *                populations in neighbouring cells. This might lead to out-of-bounds errors when
    *                copying to the outermost ghost layer! Solve this by adding an additional ghost layer
    *                as a safety margin.
    */
   UniformGeneratedPdfPackInfo(const BlockDataID pdfFieldID, cell_idx_t cellLayersToSend = 1, bool sendAll = false)
      : pdfFieldID_(pdfFieldID), ghostLayersToSend_(cellLayersToSend), sendAll_(sendAll)
   {}

   bool constantDataExchange() const override { return true; }
   bool threadsafeReceiving() const override { return true; }

   void unpackData(IBlock * receiver, Direction dir, mpi::RecvBuffer & buffer) override;
   void communicateLocal(const IBlock * sender, IBlock * receiver, Direction dir) override;

 protected:
   void packDataImpl(const IBlock * sender, Direction dir, mpi::SendBuffer & buffer) const override;

 private:
   const BlockDataID pdfFieldID_;
   // Wrapper dispatches to the generated kernels; handles the timestep for in-place patterns
   internal::UniformPackingKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_;
   cell_idx_t ghostLayersToSend_;
   bool sendAll_;
};

/**
 * Unpacks received data into the ghost region of the receiving block; either all populations
 * (sendAll) or only those streaming in subdirections of dir.
 */
template< typename PdfField_T >
void UniformGeneratedPdfPackInfo< PdfField_T >::unpackData( IBlock * receiver, Direction dir, mpi::RecvBuffer& buffer)
{
   auto field = receiver->getData< PdfField_T >(pdfFieldID_);
   CellInterval ci;
   field->getGhostRegion(dir, ci, ghostLayersToSend_, false);

   if (sendAll_)
   {
      unsigned char* bufferPtr = buffer.skip(kernels_.size(ci));
      kernels_.unpackAll(field, ci, bufferPtr);
   }
   else
   {
      uint_t size = kernels_.size(ci, dir);
      unsigned char* bufferPtr = buffer.skip(size);
      kernels_.unpackDirection(field, ci, bufferPtr, dir);
   }
}

/**
 * Process-local communication: copies directly from the sender's interior slice to the
 * receiver's ghost region, without a buffer round trip.
 */
template< typename PdfField_T >
void UniformGeneratedPdfPackInfo< PdfField_T >::communicateLocal(const IBlock* sender, IBlock* receiver, Direction dir)
{
   auto srcField = const_cast< IBlock* >(sender)->getData< PdfField_T >(pdfFieldID_);
   auto dstField = receiver->getData< PdfField_T >(pdfFieldID_);

   CellInterval srcRegion;
   CellInterval dstRegion;
   srcField->getSliceBeforeGhostLayer(dir, srcRegion, ghostLayersToSend_, false);
   dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, ghostLayersToSend_, false);

   if (sendAll_) {
      kernels_.localCopyAll(srcField, srcRegion, dstField, dstRegion);
   }
   else
   {
      kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir);
   }
}

/**
 * Packs the slice before the ghost layer of the sending block into the send buffer.
 */
template< typename PdfField_T>
void UniformGeneratedPdfPackInfo< PdfField_T >:: packDataImpl(const IBlock* sender, Direction dir, mpi::SendBuffer& buffer) const
{
   auto field = const_cast< IBlock* >(sender)->getData< PdfField_T >(pdfFieldID_);
   CellInterval ci;
   field->getSliceBeforeGhostLayer(dir, ci, ghostLayersToSend_, false);

   if (sendAll_)
   {
      unsigned char* bufferPtr = buffer.forward(kernels_.size(ci));
      kernels_.packAll(field, ci, bufferPtr);
   }
   else
   {
      unsigned char* bufferPtr = buffer.forward(kernels_.size(ci, dir));
      kernels_.packDirection(field, ci, bufferPtr, dir);
   }
}

} // namespace lbm_generated
} // namespace walberla
\ No newline at end of file
diff --git a/src/lbm_generated/evaluation/CMakeLists.txt b/src/lbm_generated/evaluation/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..922cf93c3cb989af797b913baa227a2cf1735b23
--- /dev/null
+++ b/src/lbm_generated/evaluation/CMakeLists.txt
@@ -0,0 +1,4 @@
target_sources( lbm_generated
      PRIVATE
      PerformanceEvaluation.h
      )
diff --git a/src/lbm_generated/evaluation/PerformanceEvaluation.h b/src/lbm_generated/evaluation/PerformanceEvaluation.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fb7e934a2506ca360af12882a0775bcf8281eb6
--- /dev/null
+++ b/src/lbm_generated/evaluation/PerformanceEvaluation.h
@@ -0,0 +1,415 @@
//======================================================================================================================
//
//  This file is part of waLBerla. waLBerla is free software: you can
//  redistribute it and/or modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation, either version 3 of
//  the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file PerformanceEvaluation.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/Hostname.h" +#include "core/Set.h" +#include "core/waLBerlaBuildInfo.h" +#include "core/debug/CheckFunctions.h" +#include "core/logging/Logging.h" +#include "core/mpi/MPIManager.h" +#include "core/uid/SUID.h" + +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/CellCounter.h" +#include "field/FlagUID.h" + +#include <cstdlib> +#include <map> +#include <string> +#include <sstream> + + +namespace walberla::lbm_generated { + + +//********************************************************************************************************************** +/*! 
* \brief Class for evaluating the performance of LBM simulations
*/
//**********************************************************************************************************************
template< typename CellCounter_T, typename FluidCellCounter_T >
class PerformanceEvaluationBase
{
public:

   PerformanceEvaluationBase( const weak_ptr< StructuredBlockStorage > & blocks,
                              const CellCounter_T & cellCounter, const FluidCellCounter_T & fluidCellCounter,
                              const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(),
                              const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() );

   /// Re-runs the cell counters and re-reads the number of levels from the block storage.
   void refresh();

   void logResultOnRoot( const uint_t timeSteps, const double time ) const
   {
      WALBERLA_LOG_RESULT_ON_ROOT( "Simulation performance:\n" << loggingString( timeSteps, time ) )
   }

   void logInfoOnRoot( const uint_t timeSteps, const double time ) const
   {
      WALBERLA_LOG_INFO_ON_ROOT( "Simulation performance:\n" << loggingString( timeSteps, time ) )
   }

   std::string loggingString( const uint_t timeSteps, const double time ) const;

   void getResultsForSQLOnRoot( std::map< std::string, int > & integerProperties,
                                std::map< std::string, double > & realProperties,
                                std::map< std::string, std::string > & stringProperties,
                                const uint_t timeSteps, const double time );

   static int processes() { return mpi::MPIManager::instance()->numProcesses(); }

   int threads() const { return processes() * threadsPerProcess_; }
   // 0 means "unknown" (THREADS_PER_CORE environment variable not set)
   int cores() const { return ( threadsPerCore_ == 0 ) ? 0 : ( threads() / threadsPerCore_ ); }

   /// Number of cells the domain would have if every block were on the finest level (factor 8 per level).
   uint64_t allFineCells() const
   {
      uint64_t c( uint64_t(0) );
      for( uint_t i = uint_t(0); i < levels_; ++i )
         c += cells_.numberOfCells(i) * uint64_c( math::uintPow8( levels_ - uint_t(1) - i ) );
      return c;
   }

   /// Million lattice cell updates per second; level i performs 2^i time steps per coarse time step.
   double mlups( const uint_t timeSteps, const double time ) const
   {
      double m( 0.0 );
      for( uint_t i = uint_t(0); i < levels_; ++i )
         m += double_c( timeSteps * math::uintPow2(i) ) * double_c( cells_.numberOfCells(i) );
      return m / ( time * 1000000.0 );
   }

   double mlupsPerProcess( const uint_t timeSteps, const double time ) const
   {
      return mlups( timeSteps, time ) / processes();
   }

   double mlupsPerCore( const uint_t timeSteps, const double time ) const
   {
      return ( cores() == 0 ) ? 0.0 : ( mlups( timeSteps, time ) / cores() );
   }

   /// 'virtual' MLUPS: performance as if the whole domain were refined to the finest level.
   double vMlups( const uint_t timeSteps, const double time ) const
   {
      double m( 0.0 );
      for( uint_t i = uint_t(0); i < levels_; ++i )
         m += double_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) *
              double_c( uint64_c( math::uintPow8( levels_ - uint_t(1) - i ) ) * cells_.numberOfCells(i) );
      return m / ( time * 1000000.0 );
   }

   double vMlupsPerProcess( const uint_t timeSteps, const double time ) const
   {
      return vMlups( timeSteps, time ) / processes();
   }

   double vMlupsPerCore( const uint_t timeSteps, const double time ) const
   {
      return ( cores() == 0 ) ? 0.0 : ( vMlups( timeSteps, time ) / cores() );
   }

   /// As mlups(), but counting only fluid cells.
   double mflups( const uint_t timeSteps, const double time ) const
   {
      double m( 0.0 );
      for( uint_t i = uint_t(0); i < levels_; ++i )
         m += double_c( timeSteps * math::uintPow2(i) ) * double_c( fluidCells_.numberOfCells(i) );
      return m / ( time * 1000000.0 );
   }

   double mflupsPerProcess( const uint_t timeSteps, const double time ) const
   {
      return mflups( timeSteps, time ) / processes();
   }

   double mflupsPerCore( const uint_t timeSteps, const double time ) const
   {
      return ( cores() == 0 ) ? 0.0 : ( mflups( timeSteps, time ) / cores() );
   }

   /// As vMlups(), but counting only fluid cells.
   double vMflups( const uint_t timeSteps, const double time ) const
   {
      double m( 0.0 );
      for( uint_t i = uint_t(0); i < levels_; ++i )
         m += double_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) *
              double_c( uint64_c( math::uintPow8( levels_ - uint_t(1) - i ) ) * fluidCells_.numberOfCells(i) );
      return m / ( time * 1000000.0 );
   }

   double vMflupsPerProcess( const uint_t timeSteps, const double time ) const
   {
      return vMflups( timeSteps, time ) / processes();
   }

   double vMflupsPerCore( const uint_t timeSteps, const double time ) const
   {
      return ( cores() == 0 ) ? 0.0 : ( vMflups( timeSteps, time ) / cores() );
   }

   static double timeStepsPerSecond( const uint_t timeSteps, const double time ) { return double_c( timeSteps ) / time; }

   double fineTimeStepsPerSecond( const uint_t timeSteps, const double time ) const
   {
      return double_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) / time;
   }

private:

   int threadsPerProcess_{ 1 };   // set from OMP_NUM_THREADS in OpenMP builds
   int threadsPerCore_{ 0 };      // set from THREADS_PER_CORE; 0 = unknown

   weak_ptr< StructuredBlockStorage > blocks_;
   uint_t levels_;

   CellCounter_T cells_;            // counts all cells, per level
   FluidCellCounter_T fluidCells_;  // counts fluid cells only, per level

}; // class PerformanceEvaluationBase



//**********************************************************************************************************************
/*!
* \brief Class for evaluating the performance of LBM simulations using fields
*
* Assumes that in-between creating an object of this class and calling any of the member functions the number of cells
* and the number of fluid cells do not change! For simulations with static geometry, this is always the case.
*/
//**********************************************************************************************************************
template< typename FlagField_T >
class PerformanceEvaluation : public PerformanceEvaluationBase< field::CellCounter< FlagField_T >, field::CellCounter< FlagField_T > >
{
public:
   /// Convenience front end: counts all cells and fluid cells (selected via flag UIDs) with field::CellCounter.
   PerformanceEvaluation( const weak_ptr< StructuredBlockStorage > & blocks,
                          const ConstBlockDataID & flagFieldId, const Set< FlagUID > & fluid,
                          const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(),
                          const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() )
      : PerformanceEvaluationBase< field::CellCounter< FlagField_T >, field::CellCounter< FlagField_T > >(
           blocks,
           field::CellCounter< FlagField_T >( blocks, flagFieldId, Set< FlagUID >::emptySet(), requiredSelectors, incompatibleSelectors ),
           field::CellCounter< FlagField_T >( blocks, flagFieldId, fluid, requiredSelectors, incompatibleSelectors ),
           requiredSelectors, incompatibleSelectors )
   {
   }
};


template< typename CellCounter_T, typename FluidCellCounter_T >
PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::PerformanceEvaluationBase(
   const weak_ptr< StructuredBlockStorage > & blocks,
   const CellCounter_T & cellCounter, const FluidCellCounter_T & fluidCellCounter,
   const Set<SUID> & /*requiredSelectors*/, const Set<SUID> & /*incompatibleSelectors*/ )
   : blocks_( blocks ),
     cells_( cellCounter ),
     fluidCells_( fluidCellCounter )
{
#ifdef _OPENMP
   // OpenMP builds require OMP_NUM_THREADS to report threads per process correctly
   if( std::getenv( "OMP_NUM_THREADS" ) == NULL )
      WALBERLA_ABORT( "If you are using a version of the program that was compiled with OpenMP you have to "
                      "specify the environment variable \'OMP_NUM_THREADS\' accordingly!" );
   threadsPerProcess_ = std::atoi( std::getenv( "OMP_NUM_THREADS" ) );
#endif

   if( std::getenv( "THREADS_PER_CORE" ) )
      threadsPerCore_ = std::atoi( std::getenv( "THREADS_PER_CORE" ) );

   refresh();
}



template< typename CellCounter_T, typename FluidCellCounter_T >
void PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::refresh()
{
   auto blocks = blocks_.lock();
   WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'PerformanceEvaluation' for a block storage object that doesn't exist anymore" )

   levels_ = blocks->getNumberOfLevels();

   // re-run both cell counters
   cells_();
   fluidCells_();
}



template< typename CellCounter_T, typename FluidCellCounter_T >
std::string PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::loggingString( const uint_t timeSteps, const double time ) const
{
   std::ostringstream oss;

   // placeholder printed whenever THREADS_PER_CORE is not set
   std::string na( "n/a *)" );

   std::ostringstream threadsPerCoreString;
   threadsPerCoreString << threadsPerCore_;

   std::ostringstream coresString;
   coresString << cores();

   oss << "- processes: " << processes()
       << "\n- threads: " << threads() << " (threads per process = " << threadsPerProcess_
       << ", threads per core = " << ( ( threadsPerCore_ == 0 ) ? na : threadsPerCoreString.str() ) << ")"
       << "\n- cores: " << ( ( threadsPerCore_ == 0 ) ? na : coresString.str() )
       << "\n- time steps: " << timeSteps;

   if( levels_ > uint_t(1) )
   {
      oss << " (on the coarsest grid, " << ( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ) << " on the finest grid)";
   }

   oss << "\n- time: " << time << " sec"
       << "\n- cells: " << cells_.numberOfCells();

   if( levels_ > uint_t(1) )
   {
      oss << " (" << allFineCells() << " if everything were fine -> data reduction by factor of "
          << ( real_c( allFineCells() ) / real_c( cells_.numberOfCells() ) ) << ")";
   }

   oss << "\n- fluid cells: " << fluidCells_.numberOfCells() << " ("
       << ( real_c(100) * real_c( fluidCells_.numberOfCells() ) / real_c( cells_.numberOfCells() ) ) << " % of all cells)";

   if( levels_ > uint_t(1) )
   {
      oss << "\n- distribution of cells to different grid levels:";
      for( uint_t i = uint_t(0); i < levels_; ++i )
         oss << "\n + level " << i <<": " << cells_.numberOfCells(i) << " cells (" << fluidCells_.numberOfCells(i) << " fluid cells = "
             << ( real_c(100) * real_c( fluidCells_.numberOfCells(i) ) / real_c( cells_.numberOfCells(i) ) )
             << " % of all cells on this level)";
   }

   std::ostringstream mlupsPerCoreString;
   mlupsPerCoreString << mlupsPerCore( timeSteps, time );

   std::ostringstream mflupsPerCoreString;
   mflupsPerCoreString << mflupsPerCore( timeSteps, time );

   oss << "\n- performance: " << mlups( timeSteps, time ) << " MLUPS (million lattice cell updates per second)"
       << "\n " << mlupsPerProcess( timeSteps, time ) << " MLUPS / process"
       << "\n " << ( ( threadsPerCore_ == 0 ) ? na : mlupsPerCoreString.str() ) << " MLUPS / core"
       << "\n " << mflups( timeSteps, time ) << " MFLUPS (million fluid lattice cell updates per second)"
       << "\n " << mflupsPerProcess( timeSteps, time ) << " MFLUPS / process"
       << "\n " << ( ( threadsPerCore_ == 0 ) ? na : mflupsPerCoreString.str() ) << " MFLUPS / core"
       << "\n " << timeStepsPerSecond( timeSteps, time ) << " time steps / second";

   if( levels_ > uint_t(1) )
   {
      std::ostringstream vMlupsPerCoreString;
      vMlupsPerCoreString << vMlupsPerCore( timeSteps, time );

      std::ostringstream vMflupsPerCoreString;
      vMflupsPerCoreString << vMflupsPerCore( timeSteps, time );

      oss << "\n- 'virtual' performance (if everything were fine): " << vMlups( timeSteps, time ) << " MLUPS (million lattice cell updates per second)"
          << "\n " << vMlupsPerProcess( timeSteps, time ) << " MLUPS / process"
          << "\n " << ( ( threadsPerCore_ == 0 ) ? na : vMlupsPerCoreString.str() ) << " MLUPS / core"
          << "\n " << vMflups( timeSteps, time ) << " MFLUPS (million fluid lattice cell updates per second)"
          << "\n " << vMflupsPerProcess( timeSteps, time ) << " MFLUPS / process"
          << "\n " << ( ( threadsPerCore_ == 0 ) ? na : vMflupsPerCoreString.str() ) << " MFLUPS / core"
          << "\n " << fineTimeStepsPerSecond( timeSteps, time ) << " fine time steps / second";
   }

   oss << "\n- build / run information:"
       << "\n + host machine: " << getHostName()
       << "\n + build machine: " << WALBERLA_BUILD_MACHINE
       << "\n + git SHA1: " << WALBERLA_GIT_SHA1
       << "\n + build type: " << WALBERLA_BUILD_TYPE
       << "\n + compiler flags: " << WALBERLA_COMPILER_FLAGS;

   if( threadsPerCore_ == 0 )
      oss << "\n\n *) only available if environment variable 'THREADS_PER_CORE' is set";

   return oss.str();
}



template< typename CellCounter_T, typename FluidCellCounter_T >
void PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::getResultsForSQLOnRoot( std::map< std::string, int > & integerProperties,
                                                                                             std::map< std::string, double > & realProperties,
                                                                                             std::map< std::string, std::string > & stringProperties,
                                                                                             const uint_t timeSteps, const double time )
{
   // only the root process fills the property maps
   WALBERLA_NON_ROOT_SECTION()
   {
      return;
   }

   integerProperties[ "levels" ] = int_c( levels_ );
   integerProperties[ "processes" ] =
processes(); + integerProperties[ "threads" ] = threads(); + integerProperties[ "cores" ] = cores(); + integerProperties[ "threadsPerProcess" ] = threadsPerProcess_; + integerProperties[ "threadsPerCore" ] = threadsPerCore_; + + integerProperties[ "timeSteps" ] = int_c( timeSteps ); + if( levels_ > uint_t(1) ) + integerProperties[ "fineTimeSteps" ] = int_c( timeSteps * math::uintPow2( levels_ - uint_t(1) ) ); + + realProperties[ "time" ] = real_c( time ); + + realProperties[ "cells" ] = real_c( cells_.numberOfCells() ); + if( levels_ > uint_t(1) ) + realProperties[ "refinementCellsReduction" ] = real_c( allFineCells() ) / real_c( cells_.numberOfCells() ); + realProperties[ "fluidCells" ] = real_c( fluidCells_.numberOfCells() ); + + if( levels_ > uint_t(1) ) + { + for( uint_t i = uint_t(0); i < levels_; ++i ) + { + std::ostringstream cells_i; + std::ostringstream fluidCells_i; + + cells_i << "cells_" << i; + fluidCells_i << "fluidCells_" << i; + + realProperties[ cells_i.str() ] = real_c( cells_.numberOfCells(i) ); + realProperties[ fluidCells_i.str() ] = real_c( fluidCells_.numberOfCells(i) ); + } + } + + realProperties[ "MLUPS" ] = double_c( mlups( timeSteps, time ) ); + realProperties[ "MLUPS_process" ] = double_c( mlupsPerProcess( timeSteps, time ) ); + realProperties[ "MLUPS_core" ] = double_c( mlupsPerCore( timeSteps, time ) ); + realProperties[ "MFLUPS" ] = double_c( mflups( timeSteps, time ) ); + realProperties[ "MFLUPS_process" ] = double_c( mflupsPerProcess( timeSteps, time ) ); + realProperties[ "MFLUPS_core" ] = double_c( mflupsPerCore( timeSteps, time ) ); + realProperties[ "timeStepsPerSecond" ] = double_c( timeStepsPerSecond( timeSteps, time ) ); + + if( levels_ > uint_t(1) ) + { + realProperties[ "vMLUPS" ] = double_c( vMlups( timeSteps, time ) ); + realProperties[ "vMLUPS_process" ] = double_c( vMlupsPerProcess( timeSteps, time ) ); + realProperties[ "vMLUPS_core" ] = double_c( vMlupsPerCore( timeSteps, time ) ); + realProperties[ "vMFLUPS" ] = 
double_c( vMflups( timeSteps, time ) ); + realProperties[ "vMFLUPS_process" ] = double_c( vMflupsPerProcess( timeSteps, time ) ); + realProperties[ "vMFLUPS_core" ] = double_c( vMflupsPerCore( timeSteps, time ) ); + realProperties[ "fineTimeStepsPerSecond" ] = double_c( fineTimeStepsPerSecond( timeSteps, time ) ); + } + + stringProperties[ "hostMachine" ] = std::string( getHostName() ); + stringProperties[ "buildMachine" ] = std::string( WALBERLA_BUILD_MACHINE ); + stringProperties[ "gitVersion" ] = std::string( WALBERLA_GIT_SHA1 ); + stringProperties[ "buildType" ] = std::string( WALBERLA_BUILD_TYPE ); + stringProperties[ "compilerFlags" ] = std::string( WALBERLA_COMPILER_FLAGS ); +} + +} // namespace walberla::lbm_generated diff --git a/src/lbm_generated/field/AddToStorage.h b/src/lbm_generated/field/AddToStorage.h new file mode 100644 index 0000000000000000000000000000000000000000..afb86819931238443443f3095f73880aec401d36 --- /dev/null +++ b/src/lbm_generated/field/AddToStorage.h @@ -0,0 +1,207 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AddToStorage.h +//! \ingroup lbm_generated +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "PdfField.h" +#include "core/debug/CheckFunctions.h" +#include "core/debug/Debug.h" +#include "core/uid/SUID.h" +#include "field/blockforest/BlockDataHandling.h" + +namespace walberla::lbm_generated { + +namespace internal { + +template< typename LatticeStorageSpecification_T > +class PdfFieldHandling : public field::BlockDataHandling< PdfField<LatticeStorageSpecification_T>, + LatticeStorageSpecification_T::Stencil::D == 2 > +{ +public: + + using PdfField_T = PdfField<LatticeStorageSpecification_T>; + using Base_T = field::BlockDataHandling<PdfField_T, LatticeStorageSpecification_T::Stencil::D == 2>; + + PdfFieldHandling( const weak_ptr< StructuredBlockStorage > & blocks, const LatticeStorageSpecification_T & storageSpecification, + const uint_t nrOfGhostLayers, const field::Layout & layout, const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr ) : + blocks_( blocks ), storageSpecification_( storageSpecification ), + nrOfGhostLayers_( nrOfGhostLayers ), layout_( layout ), alloc_( alloc ){} + + inline void serialize( IBlock * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override + { + Base_T::serialize( block, id, buffer ); + } + + void serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child ) override + { + Base_T::serializeCoarseToFine( block, id, buffer, child ); + } + + void serializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer ) override + { + Base_T::serializeFineToCoarse( block, id, buffer ); + } + + void deserialize( IBlock * const block, const BlockDataID & id, mpi::RecvBuffer & buffer ) override + { + Base_T::deserialize( block, id, buffer ); + } + + void deserializeCoarseToFine( Block * const block, const BlockDataID & id, 
mpi::RecvBuffer & buffer ) override + { + Base_T::deserializeCoarseToFine( block, id, buffer ); + } + + void deserializeFineToCoarse( Block * const block, const BlockDataID & id, mpi::RecvBuffer & buffer, const uint_t child ) override + { + Base_T::deserializeFineToCoarse( block, id, buffer, child ); + } + +protected: + + PdfField<LatticeStorageSpecification_T> * allocate( IBlock * const block ) override + { + return allocateDispatch( block ); + } + + PdfField<LatticeStorageSpecification_T> * reallocate( IBlock * const block ) override + { + return allocateDispatch( block ); + } + +private: + + + PdfField<LatticeStorageSpecification_T> * allocateDispatch( IBlock * const block ) + { + WALBERLA_ASSERT_NOT_NULLPTR( block ) + + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR( blocks ) + + return new PdfField_T( blocks->getNumberOfXCells( *block ), blocks->getNumberOfYCells( *block ), blocks->getNumberOfZCells( *block ), + storageSpecification_, nrOfGhostLayers_, layout_, alloc_ ); + } + + weak_ptr< StructuredBlockStorage > blocks_; + LatticeStorageSpecification_T storageSpecification_; + + uint_t nrOfGhostLayers_; + field::Layout layout_; + shared_ptr< field::FieldAllocator<real_t> > alloc_; + +}; // class PdfFieldHandling + +} // namespace internal + + + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers, + const field::Layout & layout = field::fzyx, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, 
requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const field::Layout & layout = field::fzyx, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(), + const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet(), + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + auto layout = field::fzyx; + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + auto layout = field::fzyx; + auto requiredSelectors = Set<SUID>::emptySet(); + 
auto incompatibleSelectors = Set<SUID>::emptySet(); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const field::Layout & layout = field::fzyx, + const shared_ptr< field::FieldAllocator<real_t> > alloc = nullptr) +{ + auto ghostLayers = uint_c(1); + auto requiredSelectors = Set<SUID>::emptySet(); + auto incompatibleSelectors = Set<SUID>::emptySet(); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + +template< typename LatticeStorageSpecification_T, typename BlockStorage_T > +BlockDataID addPdfFieldToStorage( const shared_ptr< BlockStorage_T > & blocks, const std::string & identifier, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers, + const field::Layout & layout, + const shared_ptr< field::FieldAllocator<real_t> > alloc) +{ + auto requiredSelectors = Set<SUID>::emptySet(); + auto incompatibleSelectors = Set<SUID>::emptySet(); + + return blocks->addBlockData( make_shared< internal::PdfFieldHandling< LatticeStorageSpecification_T > >( + blocks, storageSpecification, ghostLayers, layout, alloc ), + identifier, requiredSelectors, incompatibleSelectors ); +} + + +} // namespace walberla::lbm_generated diff --git a/src/lbm_generated/field/CMakeLists.txt b/src/lbm_generated/field/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..63bc11c8f920acc3e4c244488d72899fd7a24245 --- /dev/null +++ 
b/src/lbm_generated/field/CMakeLists.txt @@ -0,0 +1,5 @@ +target_sources( lbm_generated + PRIVATE + AddToStorage.h + PdfField.h + ) \ No newline at end of file diff --git a/src/lbm_generated/field/PdfField.h b/src/lbm_generated/field/PdfField.h new file mode 100644 index 0000000000000000000000000000000000000000..6e6b7ee88fd5e9ee0be1dbfb46da6d6e524d5536 --- /dev/null +++ b/src/lbm_generated/field/PdfField.h @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file PdfField.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "field/GhostLayerField.h" +#include "field/SwapableCompare.h" + + +namespace walberla::lbm_generated { + +template< typename LatticeStorageSpecification_T > +class PdfField : public GhostLayerField< real_t, LatticeStorageSpecification_T::Stencil::Size > +{ +public: + + //** Type Definitions ********************************************************************************************** + /*! 
\name Type Definitions */ + //@{ + using LatticeStorageSpecification = LatticeStorageSpecification_T; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + using value_type = typename GhostLayerField<real_t, Stencil::Size>::value_type; + + using Ptr = typename GhostLayerField<real_t, Stencil::Size>::Ptr; + using ConstPtr = typename GhostLayerField<real_t, Stencil::Size>::ConstPtr; + //@} + //******************************************************************************************************************* + + PdfField( const uint_t _xSize, const uint_t _ySize, const uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + const uint_t ghostLayers = uint_t(1), const field::Layout & _layout = field::zyxf, + const shared_ptr< field::FieldAllocator<real_t> > & alloc = shared_ptr< field::FieldAllocator<real_t> >() ); + + ~PdfField() override = default; + + inline PdfField * clone() const; + inline PdfField * cloneUninitialized() const; + inline PdfField * cloneShallowCopy() const; + + + ///////////////////////////////////////////////// + // Access functions (with stencil::Direction!) 
// + ///////////////////////////////////////////////// + + using GhostLayerField< real_t, Stencil::Size >::get; + + real_t & get( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) { return get( x, y, z, Stencil::idx[d] ); } + const real_t & get( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) const { return get( x, y, z, Stencil::idx[d] ); } + real_t & get( const Cell & c, stencil::Direction d ) { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + const real_t & get( const Cell & c, stencil::Direction d ) const { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + + using GhostLayerField< real_t, Stencil::Size >::operator(); + + real_t & operator()( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) { return get( x, y, z, Stencil::idx[d] ); } + const real_t & operator()( cell_idx_t x, cell_idx_t y, cell_idx_t z, stencil::Direction d ) const { return get( x, y, z, Stencil::idx[d] ); } + real_t & operator()( const Cell & c, stencil::Direction d ) { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + const real_t & operator()( const Cell & c, stencil::Direction d ) const { return get( c.x(), c.y(), c.z(), Stencil::idx[d] ); } + + +protected: + //** Shallow Copy *************************************************************************************************** + /*! 
\name Shallow Copy */
+   //@{
+   inline PdfField( const PdfField< LatticeStorageSpecification_T > & other );
+   Field< real_t, Stencil::Size > * cloneShallowCopyInternal() const override { return new PdfField< LatticeStorageSpecification_T >( *this ); }
+   //@}
+   //*******************************************************************************************************************
+
+   LatticeStorageSpecification_T storageSpecification_;
+};
+
+
+
+template< typename LatticeStorageSpecification_T >
+PdfField< LatticeStorageSpecification_T >::PdfField( const uint_t _xSize, const uint_t _ySize, const uint_t _zSize,
+                                                     const LatticeStorageSpecification_T & storageSpecification,
+                                                     const uint_t ghostLayers, const field::Layout & _layout,
+                                                     const shared_ptr< field::FieldAllocator<real_t> > & alloc ) :
+
+   GhostLayerField< real_t, Stencil::Size >( _xSize, _ySize, _zSize, ghostLayers, _layout, alloc ),
+   storageSpecification_( storageSpecification )
+
+{
+   // take care of proper thread<->memory assignment (first-touch allocation policy !):
+   // under OpenMP, setWithGhostLayer already touches the memory with the owning threads.
+   // NOTE(review): this call was previously issued twice in OpenMP builds (once guarded
+   // by _OPENMP and once unconditionally); a single unconditional call is sufficient.
+   this->setWithGhostLayer( real_t(0) );
+}
+
+
+
+template< typename LatticeStorageSpecification_T >
+inline PdfField< LatticeStorageSpecification_T > * PdfField< LatticeStorageSpecification_T >::clone() const
+{
+   return dynamic_cast< PdfField * >( GhostLayerField< real_t, Stencil::Size >::clone() );
+}
+
+template< typename LatticeStorageSpecification_T >
+inline PdfField< LatticeStorageSpecification_T > * PdfField< LatticeStorageSpecification_T >::cloneUninitialized() const
+{
+   return dynamic_cast< PdfField * >( GhostLayerField< real_t, Stencil::Size >::cloneUninitialized() );
+}
+
+template< typename LatticeStorageSpecification_T >
+inline PdfField< LatticeStorageSpecification_T > * PdfField< LatticeStorageSpecification_T >::cloneShallowCopy() const
+{
+   return dynamic_cast< PdfField * >( GhostLayerField< real_t, Stencil::Size >::cloneShallowCopy() );
+}
+
+template< typename LatticeStorageSpecification_T >
+inline PdfField< LatticeStorageSpecification_T >::PdfField( const PdfField< LatticeStorageSpecification_T > & other )
+   : GhostLayerField< real_t, Stencil::Size >::GhostLayerField( other )
+{
+}
+
+} // namespace walberla::lbm_generated
diff --git a/src/lbm_generated/gpu/AddToStorage.h b/src/lbm_generated/gpu/AddToStorage.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef8f28409709ad37244276e3b68269d0edcf19da
--- /dev/null
+++ b/src/lbm_generated/gpu/AddToStorage.h
@@ -0,0 +1,105 @@
+//======================================================================================================================
+//
+// This file is part of waLBerla. waLBerla is free software: you can
+// redistribute it and/or modify it under the terms of the GNU General Public
+// License as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AddToStorage.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/debug/CheckFunctions.h" +#include "core/debug/Debug.h" +#include "core/uid/SUID.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/FieldCopy.h" + +#include "field/blockforest/BlockDataHandling.h" + +#include "GPUPdfField.h" + +namespace walberla::lbm_generated +{ + +namespace internal +{ + +template< typename LatticeStorageSpecification_T> +GPUPdfField< LatticeStorageSpecification_T > * createGPUPdfField( const IBlock * const block, + const StructuredBlockStorage * const bs, + const LatticeStorageSpecification_T& storageSpecification, + const uint_t ghostLayers, + const field::Layout & layout, + const bool usePitchedMem ) +{ + using GPUField_T = GPUPdfField< LatticeStorageSpecification_T >; + + auto gpuField = new GPUField_T(bs->getNumberOfXCells( *block ), + bs->getNumberOfYCells( *block ), + bs->getNumberOfZCells( *block ), + storageSpecification, ghostLayers, + layout, usePitchedMem); + + return gpuField; +} + +template< typename Field_T, typename LatticeStorageSpecification_T > +GPUPdfField< LatticeStorageSpecification_T >* + createGPUPdfFieldFromCPUPdfField(const IBlock* const block, const StructuredBlockStorage* const, + const LatticeStorageSpecification_T& storageSpecification, + ConstBlockDataID cpuFieldID, const bool usePitchedMem, const bool copyCPUField = true) +{ + using GPUField_T = 
GPUPdfField< LatticeStorageSpecification_T >; + + const Field_T* f = block->getData< Field_T >(cpuFieldID); + + auto gpuField = new GPUField_T(f->xSize(), f->ySize(), f->zSize(), storageSpecification, f->nrOfGhostLayers(), + f->layout(), usePitchedMem); + + if (copyCPUField) + gpu::fieldCpy(*gpuField, *f); + + return gpuField; +} + +} // namespace internal + +template< typename GPUField_T, typename LatticeStorageSpecification_T > +BlockDataID addGPUPdfFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs, + const std::string & identifier, + const LatticeStorageSpecification_T& storageSpecification, + const Layout layout = fzyx, + const uint_t nrOfGhostLayers = 1, + const bool usePitchedMem = true ) +{ + + auto func = std::bind(internal::createGPUPdfField< LatticeStorageSpecification_T >, + std::placeholders::_1, std::placeholders::_2, storageSpecification, nrOfGhostLayers, layout, usePitchedMem); + return bs->addStructuredBlockData< GPUPdfField< LatticeStorageSpecification_T > >(func, identifier); +} + +template< typename Field_T, typename LatticeStorageSpecification_T > +BlockDataID addGPUPdfFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs, ConstBlockDataID cpuFieldID, + const LatticeStorageSpecification_T& storageSpecification, + const std::string& identifier, const bool usePitchedMem = true, const bool copyCPUField = true) +{ + auto func = std::bind(internal::createGPUPdfFieldFromCPUPdfField< Field_T, LatticeStorageSpecification_T >, + std::placeholders::_1, std::placeholders::_2, storageSpecification, cpuFieldID, usePitchedMem, copyCPUField); + return bs->addStructuredBlockData< GPUPdfField< LatticeStorageSpecification_T > >(func, identifier); +} + +} // namespace walberla::lbm_generated \ No newline at end of file diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h new file mode 100644 index 0000000000000000000000000000000000000000..8d95855e61fd4238c2c0f201024f87abe7111107 --- 
/dev/null +++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.h @@ -0,0 +1,108 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStepGPU.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/NonUniformGPUScheme.h" + +#include "timeloop/SweepTimeloop.h" + +#include <utility> + +#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h" + +namespace walberla +{ + +using gpu::communication::NonUniformGPUScheme; + +namespace lbm_generated +{ + +/** + * + * @tparam LatticeStorageSpecification_T Generated storage specification + * @tparam SweepCollection_T LBM SweepCollection (must be able to call stream, collide, streamCollide and + * streamOnlyNoAdvancement) + * @tparam BoundaryCollection_T LBM Boundary collection (Functor that runs all boundary kernels at call) + */ +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +class BasicRecursiveTimeStepGPU +{ + public: + using LatticeStorageSpecification_T = typename 
PdfField_T::LatticeStorageSpecification; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + + using CommScheme = gpu::communication::NonUniformGPUScheme< CommunicationStencil >; + using PackInfo = lbm_generated::NonuniformGeneratedGPUPdfPackInfo< PdfField_T >; + + BasicRecursiveTimeStepGPU(std::shared_ptr< StructuredBlockForest >& sbfs, const BlockDataID& pdfFieldId, + SweepCollection_T& sweepCollection, BoundaryCollection_T& boundaryCollection, + std::shared_ptr< CommScheme >& commScheme, std::shared_ptr< PackInfo >& pdfFieldPackInfo) + : sbfs_(sbfs), pdfFieldId_(pdfFieldId), pdfFieldPackInfo_(pdfFieldPackInfo), commScheme_(commScheme), + sweepCollection_(sweepCollection), boundaryCollection_(boundaryCollection) + { +#ifndef NDEBUG + for (auto& block : *sbfs) + WALBERLA_ASSERT(block.isDataOfType< PdfField_T >(pdfFieldId_), + "Template parameter PdfField_T is of different type than BlockDataID pdfFieldId that is " + "provided as constructor argument") +#endif + maxLevel_ = sbfs->getDepth(); + + for (uint_t level = 0; level <= maxLevel_; level++) + { + std::vector< Block* > blocks; + sbfs->getBlocks(blocks, level); + blocks_.push_back(blocks); + } + }; + + ~BasicRecursiveTimeStepGPU() = default; + + void operator()() { timestep(0); }; + void addRefinementToTimeLoop(SweepTimeloop& timeloop, uint_t level = 0); + void test(uint_t maxLevel, uint_t level = 0); + + private: + void timestep(uint_t level); + void ghostLayerPropagation(Block* block, gpuStream_t gpuStream); + std::function< void() > executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation = false); + + std::function< void() > executeBoundaryHandlingOnLevel(uint_t level); + + std::shared_ptr< StructuredBlockForest > sbfs_; + uint_t maxLevel_; + std::vector< std::vector< Block* > > blocks_; + + const BlockDataID pdfFieldId_; + std::shared_ptr< PackInfo > pdfFieldPackInfo_; + 
std::shared_ptr< CommScheme > commScheme_;
+
+   SweepCollection_T& sweepCollection_;
+   BoundaryCollection_T& boundaryCollection_;
+};
+
+} // namespace lbm_generated
+} // namespace walberla
+
+#include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h"
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bb43c3c874e160253dd096e87ee8c50e2aa08b3
--- /dev/null
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
@@ -0,0 +1,255 @@
+//======================================================================================================================
+//
+// This file is part of waLBerla. waLBerla is free software: you can
+// redistribute it and/or modify it under the terms of the GNU General Public
+// License as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+//
+// waLBerla is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file BasicRecursiveTimeStepGPU.impl.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "BasicRecursiveTimeStepGPU.h"
+
+namespace walberla {
+namespace lbm_generated {
+
+template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T >
+void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::timestep(uint_t level)
+{
+   std::vector<Block *> blocks;
+   sbfs_->getBlocks(blocks, level);
+
+   uint_t maxLevel = sbfs_->getDepth();
+
+   // 1.1 Collision
+   for(auto b: blocks){
+      sweepCollection_.streamCollide(b);
+   }
+
+   // 1.2 Recursive Descent
+   if(level < maxLevel){
+      timestep(level + 1);
+   }
+
+   // 1.3 Coarse to Fine Communication, receiving end
+   if(level != 0){
+      commScheme_->communicateCoarseToFine(level);
+   }
+
+   // 1.4 Equal-Level Communication
+   commScheme_->communicateEqualLevel(level);
+
+   // 1.5 Boundary Handling and Coalescence Preparation
+   for(auto b : blocks){
+      boundaryCollection_(b, nullptr);
+      if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b);
+   }
+
+   // 1.6 Fine to Coarse Communication, receiving end
+   if(level < maxLevel){
+      commScheme_->communicateFineToCoarse(level + 1);
+   }
+
+   // Stop here if on coarsest level.
+   // Otherwise, continue to second subcycle.
+   if(level == 0) return;
+
+   // 2.1 Collision and Ghost-Layer Propagation
+   for(auto b: blocks){
+      ghostLayerPropagation(b, nullptr); // on the default stream, like boundaryCollection_; GL-Propagation first without swapping arrays...
+ sweepCollection_.streamCollide(b); // then Stream-Collide on interior, and swap arrays + } + + // 2.2 Recursive Descent + if(level < maxLevel){ + timestep(level + 1); + } + + // 2.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 2.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks){ + boundaryCollection_(b, nullptr); + if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel){ + commScheme_->communicateFineToCoarse(level + 1); + } +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level) +{ + // 1.1 Collision + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level)); + + // 1.2 Recursive Descent + if(level < maxLevel_){ + addRefinementToTimeLoop(timeloop, level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateCoarseToFineFunctor(level), "Refinement Cycle: communicate coarse to fine on level " + std::to_string(level)); + } + + // 1.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + + // 1.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); 
+ } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, true), "Refinement Cycle: streamCollide with ghost layer propagation on level " + std::to_string(level)); + + // 2.2 Recursive Descent + if(level < maxLevel_) + addRefinementToTimeLoop(timeloop, level + 1); + + + // 2.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + // 2.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_) + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + +} + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::test(uint_t maxLevel, uint_t level) +{ + // 1.1 Collision + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: streamCollide on level " + std::to_string(level)); + + // 1.2 Recursive Descent + if(level < maxLevel){ + test(maxLevel, level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate coarse to fine on level " + std::to_string(level)); + } + + // 1.4 Equal-Level Communication + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + + // 1.5 Boundary Handling and Coalescence Preparation + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: boundary handling on level " + 
std::to_string(level)); + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel){ + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: streamCollide with ghost layer propagation on level " + std::to_string(level)); + + // 2.2 Recursive Descent + if(level < maxLevel) + test(maxLevel, level + 1); + + + // 2.4 Equal-Level Communication + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + // 2.5 Boundary Handling and Coalescence Preparation + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel) + WALBERLA_LOG_INFO_ON_ROOT("Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation) +{ + return [level, withGhostLayerPropagation, this]() + { + if (withGhostLayerPropagation) + { + for(auto b: blocks_[level]){ + ghostLayerPropagation(b, nullptr); + sweepCollection_.streamCollide(b, 0, nullptr); + } + } + else + { + for(auto b: blocks_[level]){ + sweepCollection_.streamCollide(b, 0, nullptr); + } + } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + }; +} + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeBoundaryHandlingOnLevel(uint_t level) +{ + return 
[this, level]() { + for (auto b : blocks_[level]) + { + boundaryCollection_(b, nullptr); + if (level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b, nullptr); + } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + }; +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation( + Block * block, gpuStream_t gpuStream) +{ + auto pdfField = block->getData<PdfField_T>(pdfFieldId_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + // Propagate on ghost layers shadowing coarse or no blocks + if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + pdfField->getGhostRegion(*it, ci, 1); + sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci, gpuStream); + } + } +} + +} // namespace lbm_generated +} // namespace walberla diff --git a/src/lbm_generated/gpu/CMakeLists.txt b/src/lbm_generated/gpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f81e5f2b370d478473f4d02d3853c469c905799f --- /dev/null +++ b/src/lbm_generated/gpu/CMakeLists.txt @@ -0,0 +1,12 @@ +target_sources( lbm_generated + PRIVATE + AddToStorage.h + BasicRecursiveTimeStepGPU.h + BasicRecursiveTimeStepGPU.impl.h + GPUPdfField.h + NonuniformGPUCommData.h + NonuniformGPUCommData.impl.h + NonuniformGeneratedGPUPdfPackInfo.h + NonuniformGeneratedGPUPdfPackInfo.impl.h + UniformGeneratedGPUPdfPackInfo.h + ) \ No newline at end of file diff --git a/src/lbm_generated/gpu/GPUPdfField.h b/src/lbm_generated/gpu/GPUPdfField.h new file mode 100644 index 0000000000000000000000000000000000000000..1a9f59a116b8c4e7c5fcb4ebd817dcb5cad0a908 --- /dev/null +++ b/src/lbm_generated/gpu/GPUPdfField.h @@ -0,0 +1,66 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUPdfField.h +//! \ingroup lbm_generated +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "gpu/GPUField.h" + +using namespace walberla::gpu; + +namespace walberla::lbm_generated { + +template< typename LatticeStorageSpecification_T > +class GPUPdfField : public GPUField< real_t > +{ + public: + + //** Type Definitions ********************************************************************************************** + /*! 
\name Type Definitions */ + //@{ + using LatticeStorageSpecification = LatticeStorageSpecification_T; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + using value_type = typename GPUField<real_t>::value_type; + //@} + //******************************************************************************************************************* + + GPUPdfField( uint_t _xSize, uint_t _ySize, uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + uint_t _nrOfGhostLayers, const Layout & _layout = zyxf, bool usePitchedMem = true ); + + + ~GPUPdfField() = default; + + protected: + LatticeStorageSpecification_T storageSpecification_; +}; + + + +template< typename LatticeStorageSpecification_T > +GPUPdfField< LatticeStorageSpecification_T >::GPUPdfField( uint_t _xSize, uint_t _ySize, uint_t _zSize, + const LatticeStorageSpecification_T & storageSpecification, + uint_t ghostLayers, const Layout & layout, bool usePitchedMem) : + GPUField< real_t>( _xSize, _ySize, _zSize, LatticeStorageSpecification_T::Stencil::Size, ghostLayers, layout, usePitchedMem ), storageSpecification_( storageSpecification ) +{ +} + +} // namespace walberla::lbm_generated \ No newline at end of file diff --git a/src/lbm_generated/gpu/NonuniformGPUCommData.h b/src/lbm_generated/gpu/NonuniformGPUCommData.h new file mode 100644 index 0000000000000000000000000000000000000000..795a9bcb5868c156f8c42dd94057f36361ca1e3d --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGPUCommData.h @@ -0,0 +1,137 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version.
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGPUCommData.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" +#include "blockforest/BlockDataHandling.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" + +#include "domain_decomposition/IBlock.h" + +#include "field/FlagField.h" + +#include "lbm_generated/communication/NonuniformCommData.h" + +#include "stencil/Directions.h" + +#define USE_CELL_INTERVALS + +namespace walberla::lbm_generated { + +using PartialCoalescenceMaskFieldGPU = gpu::GPUField< uint32_t >; + +template< typename LatticeStorageSpecification_T > +class NonuniformGPUCommData +{ + private: + void registerFlags(); + void computeBitMask(); + void syncDataGPU(); + + public: + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + +#if defined(USE_CELL_INTERVALS) + NonuniformGPUCommData(IBlock* const block, uint_t xSize, uint_t ySize, uint_t zSize) + : block_(block), maskField_(xSize, ySize, zSize, 2), + maskFieldGPU_(xSize, ySize, zSize, 1, 2, field::fzyx), + interiorInterval(0, 0, 0, cell_idx_c(xSize) - 1, cell_idx_c(ySize) - 1, cell_idx_c(zSize) - 1) + { + registerFlags(); + computeBitMask(); + syncDataGPU(); + }; +#else + NonuniformGPUCommData(IBlock* const block, const BlockDataID pdfFieldID, uint_t xSize, uint_t ySize, uint_t zSize) + :
block_(block), pdfFieldID_(pdfFieldID), maskField_(xSize, ySize, zSize, 2) + { + registerFlags(); + computeBitMask(); + syncDataGPU(); + }; +#endif + + bool operator==(const NonuniformGPUCommData& other) { return this == &other; } + bool operator!=(const NonuniformGPUCommData& other) { return this != &other; } + + PartialCoalescenceMaskField& getMaskField() { return maskField_; } + const PartialCoalescenceMaskField& getMaskField() const { return maskField_; } + + PartialCoalescenceMaskFieldGPU& getMaskFieldGPU() { return maskFieldGPU_; } + const PartialCoalescenceMaskFieldGPU& getMaskFieldGPU() const { return maskFieldGPU_; } + + private: +#if defined(USE_CELL_INTERVALS) + void prepareIntervals(); + void setFlagOnInterval(const CellInterval & ci, const uint_t fIdx); +#else + void prepareFlags(); + void resetCornerSkippingOriginFlags(); +#endif + + void setupCornerSkippingOrigins(stencil::Direction commDir); + void setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir); + + bool haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir); + + const IBlock* const block_; + PartialCoalescenceMaskField maskField_; + PartialCoalescenceMaskFieldGPU maskFieldGPU_; + +#if defined(USE_CELL_INTERVALS) + const CellInterval interiorInterval; + std::vector< CellInterval > passThroughIntervals_; + std::vector< CellInterval > cornerSkippingOriginIntervals_; +#endif +}; + + +template< typename LatticeStorageSpecification_T > +class NonuniformGPUCommDataHandling + : public blockforest::AlwaysInitializeBlockDataHandling< NonuniformGPUCommData< LatticeStorageSpecification_T > > +{ + public: + using CommmData_T = NonuniformGPUCommData< LatticeStorageSpecification_T >; + + NonuniformGPUCommDataHandling(const weak_ptr< StructuredBlockForest >& blocks) + : blocks_(blocks){}; + + CommmData_T* initialize(IBlock* const block) override + { + WALBERLA_ASSERT_NOT_NULLPTR(block) + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + return new 
CommmData_T(block, blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block), + blocks->getNumberOfZCells(*block)); + } + + private: + const weak_ptr< StructuredBlockStorage > blocks_; +}; + +} // walberla::lbm_generated + +#include "lbm_generated/gpu/NonuniformGPUCommData.impl.h" diff --git a/src/lbm_generated/gpu/NonuniformGPUCommData.impl.h b/src/lbm_generated/gpu/NonuniformGPUCommData.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..47d6f033046b46d9d6156b6c91c0ffff6e82cf91 --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGPUCommData.impl.h @@ -0,0 +1,322 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGPUCommData.impl.h +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/all.h" + +#include "lbm_generated/gpu/NonuniformGPUCommData.h" + +#include "stencil/Directions.h" + +#define IDX_FLAG(d) (1 << d) + +#if !defined(USE_CELL_INTERVALS) +#define INTERIOR_FLAG_BIT 29 +#define INTERIOR_FLAG (1 << INTERIOR_FLAG_BIT) + +#define PASS_THROUGH_FLAG_BIT 30 +#define PASS_THROUGH_FLAG (1 << PASS_THROUGH_FLAG_BIT) + +#define CORNER_SKIPPING_ORIGIN_FLAG_BIT 31 +#define CORNER_SKIPPING_ORIGIN_FLAG (1 << CORNER_SKIPPING_ORIGIN_FLAG_BIT) +#endif + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { + +/*********************************************************************************************************************** + * Bit Mask Computation * + **********************************************************************************************************************/ + +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::registerFlags() +{ +#if !defined(USE_CELL_INTERVALS) + maskField_.registerFlag(FlagUID(true), INTERIOR_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), PASS_THROUGH_FLAG_BIT); + maskField_.registerFlag(FlagUID(true), CORNER_SKIPPING_ORIGIN_FLAG_BIT); +#endif + + for(auto it = Stencil::beginNoCenter(); it != Stencil::end(); ++it){ + maskField_.registerFlag(FlagUID(true), Stencil::idx[*it]); + } +} + +#if defined(USE_CELL_INTERVALS) + +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::prepareIntervals() +{ + passThroughIntervals_.clear(); + const Block * b = dynamic_cast< const Block * >(block_); + + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = 
blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + CellInterval ci; + maskField_.getGhostRegion(*commDir, ci, 2); + passThroughIntervals_.push_back(ci); + } + } +} + +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::setFlagOnInterval(const CellInterval & ci, + const uint_t fIdx) +{ + for(auto c : ci){ + maskField_.addFlag(c, IDX_FLAG(fIdx)); + } +} + +#else + +/** + * Prepares the INTERIOR and PASS_THROUGH flags. + * Sets the domain interior to INTERIOR. Sets any ghost layers corresponding to a coarse block + * or no block to PASS_THROUGH. + */ +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::prepareFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Set interior to origin + for (auto it = maskField_.beginXYZ(); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), INTERIOR_FLAG); + } + + // Set GLs to pass-through + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.addFlag(it.cell(), PASS_THROUGH_FLAG); + } + } + } +} + +/** + * Resets the origin flag on any ghost layers.
 + */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::resetCornerSkippingOriginFlags() +{ + const Block * b = dynamic_cast< const Block * >(block_); + + // Remove origin flag from any ghost layers + for(auto commDir = CommunicationStencil::beginNoCenter(); commDir != CommunicationStencil::end(); ++commDir){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*commDir); + if(!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)){ + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, *commDir); it != maskField_.end(); ++it){ + maskField_.removeFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } + } + } +} + +#endif + + +/** + * Determines whether the current block has the smallest BlockID among all fine blocks of a + * given intersection volume. + * @tparam LatticeStorageSpecification_T + * @param cornerDir + * @return + */ +template< typename LatticeStorageSpecification_T > +inline bool NonuniformGPUCommData< LatticeStorageSpecification_T >::haveSmallestIdInIntersection(Vector3<cell_idx_t> cornerDir) +{ + const IBlockID& myId = block_->getId(); + const Block* b = dynamic_cast< const Block* >(block_); + return forEachSubdirectionCancel(cornerDir, [&](Vector3< cell_idx_t > dirVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + if (b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + if (b->getNeighbor(nSecIdx, 0).getId() < myId) return false; + } + return true; + }); +} + + +/** + * Sets up the feasible space for the given communication direction. + * Additionally to the field interior, marks every ghost layer slice corresponding to an adjacent coarse block, + * and the corresponding corner as feasible, if that corner also belongs to a coarse block and the current block + * has the smallest BlockID participating in the intersection.
+ * @param commDir A communication direction pointing toward an adjacent coarse block + */ +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::setupCornerSkippingOrigins(stencil::Direction commDir) +{ +#if defined(USE_CELL_INTERVALS) + cornerSkippingOriginIntervals_.clear(); +#else + resetCornerSkippingOriginFlags(); +#endif + + const Block* b = dynamic_cast< const Block* >(block_); + Vector3<cell_idx_t> commDirVec(stencil::cx[commDir], stencil::cy[commDir], stencil::cz[commDir]); + + // Iterate all orthogonal comm directions + forEachOrthogonalDirection< CommunicationStencil >(commDirVec, [&](Vector3< cell_idx_t > toSourceVec) { + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(toSourceVec[0], toSourceVec[1], toSourceVec[2]); + // Find if there is a coarse block or no block at all in this neighborhood + // There are three possibilities: Coarse block, Same-level block or no block + // Finer block is not possible because of 2:1 balance + if (!b->neighborhoodSectionHasEquallySizedBlock(nSecIdx)) + { + // From this adjacent coarse block (or not-block, for boundary handling), corner skipping must be handled. + // Also, if there is no block, boundary handling in that region must be done on only + // one of the participating fine blocks. + Vector3< cell_idx_t > cornerDirVec = toSourceVec + commDirVec; + + // If the current block has the smallest participating ID... + if (haveSmallestIdInIntersection(cornerDirVec)) + { + const stencil::Direction toSourceDir = stencil::vectorToDirection(toSourceVec); + + // ... Mark source GL region as corner skipping origin. 
+#if defined(USE_CELL_INTERVALS) + CellInterval ci; + maskField_.getGhostRegion(toSourceDir, ci, 2); + cornerSkippingOriginIntervals_.push_back(ci); +#else + for (auto it = maskField_.beginGhostLayerOnlyXYZ(toSourceDir); it != maskField_.end(); ++it) + { + maskField_.addFlag(it.cell(), CORNER_SKIPPING_ORIGIN_FLAG); + } +#endif + } + } + }); +} + + +template< typename LatticeStorageSpecification_T > +inline void NonuniformGPUCommData< LatticeStorageSpecification_T >::setupBitMaskSlice(stencil::Direction commDir, stencil::Direction streamDir) +{ + uint_t fIdx = Stencil::idx[streamDir]; + Cell streamVec(stencil::cx[streamDir], stencil::cy[streamDir], stencil::cz[streamDir]); + +#if defined(USE_CELL_INTERVALS) + CellInterval commSliceInterval; + maskField_.getGhostRegion(commDir, commSliceInterval, 2); + + // Shift back once + commSliceInterval.shift(-streamVec); + + // Intersect with interior and set flag on intersection volume + CellInterval interiorIntersection(interiorInterval); + interiorIntersection.intersect(commSliceInterval); + if(!interiorIntersection.empty()){ + interiorIntersection.shift(streamVec); + setFlagOnInterval(interiorIntersection, fIdx); + } + + // Intersect with pass-through regions... + for(auto passThroughIntersection : std::as_const(passThroughIntervals_)){ + passThroughIntersection.intersect(commSliceInterval); + if(passThroughIntersection.empty()) continue; + + // ... shift back once more ... + passThroughIntersection.shift(-streamVec); + + // ... intersect with interior ... + interiorIntersection = interiorInterval; + interiorIntersection.intersect(passThroughIntersection); + if(!interiorIntersection.empty()){ + interiorIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z()); + setFlagOnInterval(interiorIntersection, fIdx); + } + + // ... 
and with corner-skipping origin regions + for(auto originIntersection : std::as_const(cornerSkippingOriginIntervals_)){ + originIntersection.intersect(passThroughIntersection); + if(!originIntersection.empty()){ + originIntersection.shift(2*streamVec.x(), 2* streamVec.y(), 2*streamVec.z()); + setFlagOnInterval(originIntersection, fIdx); + } + } + } +#else + for(auto it = maskField_.beginGhostLayerOnlyXYZ(2, commDir); it != maskField_.end(); ++it){ + Cell currentCell = it.cell(); + + // Shift back once + Cell shiftedCell = currentCell - streamVec; + + if (maskField_.isFlagSet(shiftedCell, INTERIOR_FLAG)){ + maskField_.addFlag(currentCell, IDX_FLAG(fIdx)); + } + else if (maskField_.isFlagSet(shiftedCell, PASS_THROUGH_FLAG)){ + // Shift back twice + shiftedCell -= streamVec; + if (maskField_.isPartOfMaskSet(shiftedCell, INTERIOR_FLAG | CORNER_SKIPPING_ORIGIN_FLAG)){ + maskField_.addFlag(currentCell, IDX_FLAG(fIdx)); + } + + } + // else continue; + } +#endif +} + +/** + * Computes the partial coalescence bit mask on the mask field. + * Assumes that all flags are already registered at the field, and that the field + * has been initialized to zero. 
+ */ +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::computeBitMask() +{ +#if defined(USE_CELL_INTERVALS) + prepareIntervals(); +#else + prepareFlags(); +#endif + + const Block* b = dynamic_cast< const Block* >(block_); + for(auto commIt = CommunicationStencil::beginNoCenter(); commIt != CommunicationStencil::end(); ++commIt){ + stencil::Direction commDir = *commIt; + const uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(commDir); + if(b->neighborhoodSectionHasLargerBlock(nSecIdx)){ + setupCornerSkippingOrigins(commDir); + + for(uint_t streamDirIdx = 0; streamDirIdx < Stencil::d_per_d_length[commDir]; streamDirIdx++){ + stencil::Direction streamDir = Stencil::d_per_d[commDir][streamDirIdx]; + setupBitMaskSlice(commDir, streamDir); + } + } + } +} + +template< typename LatticeStorageSpecification_T > +void NonuniformGPUCommData< LatticeStorageSpecification_T >::syncDataGPU() +{ + gpu::fieldCpy(maskFieldGPU_, maskField_); +} +} // walberla::lbm_generated diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..d6ac87010a6889b899380514ec51d717159bd6f8 --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h @@ -0,0 +1,332 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedGPUPdfPackInfo.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedNonUniformGPUPackInfo.h" + +#include "lbm_generated/gpu/NonuniformGPUCommData.h" +#include "lbm_generated/field/PdfField.h" + +namespace walberla::lbm_generated +{ +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. + */ +template< typename PdfField_T, bool inplace > +class NonuniformGPUPackingKernelsWrapper +{ + public: + void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, gpuStream_t stream = nullptr) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0; + + void unpackRedistribute(PdfField_T* dstField, 
CellInterval ci, unsigned char* inBuffer, + stencil::Direction dir, gpuStream_t stream = nullptr) const = 0; + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci, + unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir) const = 0; + void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + + uint_t size(CellInterval ci, Direction dir) const = 0; + uint_t size(CellInterval ci) const = 0; + uint_t redistributeSize(CellInterval ci) const = 0; + uint_t partialCoalescenceSize(CellInterval ci, Direction dir) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. + */ +template< typename PdfField_T > +class NonuniformGPUPackingKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const + { + kernels_.packAll(srcField, ci, outBuffer, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const + { + kernels_.unpackAll(dstField, ci, inBuffer, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, gpuStream_t stream = nullptr) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir, stream); + } + + void 
unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, Direction dir, gpuStream_t stream) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, stream); + } + + void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, + stencil::Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, stream); + } + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci, + unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, stream); + } + + void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.unpackCoalescence(dstField, ci, inBuffer, dir, stream); + } + + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir, gpuStream_t stream = nullptr) const + { + kernels_.zeroCoalescenceRegion(dstField, ci, dir, stream); + } + + uint_t size(CellInterval ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval ci) const { return kernels_.size(ci); } + uint_t redistributeSize(CellInterval ci) const { return kernels_.redistributeSize(ci); } + uint_t partialCoalescenceSize(CellInterval ci, Direction dir) const + { + return kernels_.partialCoalescenceSize(ci, dir); + } + + private: + PackingKernels_T kernels_; +}; + +/* + * Template Specialization for in-place patterns, extracting the timestep from the lattice model. 
+ */ +template< typename PdfField_T > +class NonuniformGPUPackingKernelsWrapper< PdfField_T, true > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packAll(srcField, ci, outBuffer, timestep, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackAll(dstField, ci, inBuffer, timestep, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, gpuStream_t stream = nullptr) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packDirection(srcField, ci, outBuffer, dir, timestep, stream); + } + + void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, + CellInterval dstInterval, Direction dir, gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyDirection(srcField, srcInterval, dstField, 
dstInterval, dir, timestep, stream); + } + + void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, + stencil::Direction dir, gpuStream_t stream = nullptr) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, timestep, stream); + } + + void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci, + unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, timestep, stream); + } + + void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir, gpuStream_t stream = nullptr) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.zeroCoalescenceRegion(dstField, ci, dir, timestep, stream); + } + + void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackCoalescence(dstField, ci, inBuffer, dir, timestep, stream); + } + + uint_t size(CellInterval ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval ci) const { return kernels_.size(ci); } + uint_t redistributeSize(CellInterval ci) const { return kernels_.redistributeSize(ci); } + uint_t partialCoalescenceSize(CellInterval ci, Direction dir) const + { + return kernels_.partialCoalescenceSize(ci, dir); + } + + private: + PackingKernels_T kernels_; +}; +} // namespace internal + +/*********************************************************************************************************************** + * Class Declaration * + **********************************************************************************************************************/ + +template< typename PdfField_T > +class NonuniformGeneratedGPUPdfPackInfo : public 
walberla::gpu::GeneratedNonUniformGPUPackInfo +{ + public: + using VoidFunction = std::function< void(gpuStream_t) >; + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + using CommData_T = NonuniformGPUCommData< LatticeStorageSpecification_T >; + + NonuniformGeneratedGPUPdfPackInfo(const BlockDataID pdfFieldID, const BlockDataID commDataID) + : pdfFieldID_(pdfFieldID), commDataID_(commDataID){}; + + bool constantDataExchange() const override { return true; }; + bool threadsafeReceiving() const override { return false; }; + + /// Equal Level + void unpackDataEqualLevel(Block* receiver, Direction dir, GpuBuffer_T& buffer) override; + void communicateLocalEqualLevel(const Block* sender, Block* receiver, stencil::Direction dir, + gpuStream_t stream) override; + void getLocalEqualLevelCommFunction(std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver, + stencil::Direction dir) override; + + /// Coarse to Fine + void unpackDataCoarseToFine(Block* fineReceiver, const BlockID& coarseSender, stencil::Direction dir, + GpuBuffer_T& buffer) override; + void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) override; + void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, + GpuBuffer_T& buffer, gpuStream_t stream) override; + void getLocalCoarseToFineCommFunction(std::vector< VoidFunction >& commFunctions, const Block* coarseSender, + Block* fineReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override; + + /// Fine to Coarse + void prepareCoalescence(Block* coarseReceiver, gpuStream_t gpuStream = nullptr); + void unpackDataFineToCoarse(Block* coarseReceiver, const 
BlockID& fineSender, stencil::Direction dir, + GpuBuffer_T& buffer) override; + + void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir) override; + void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir, + GpuBuffer_T& buffer, gpuStream_t stream) override; + void getLocalFineToCoarseCommFunction(std::vector< VoidFunction >& commFunctions, const Block* fineSender, + Block* coarseReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override; + + uint_t sizeEqualLevelSend(const Block* sender, stencil::Direction dir) override; + uint_t sizeCoarseToFineSend(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir) override; + uint_t sizeFineToCoarseSend(const Block* fineSender, stencil::Direction dir) override; + + protected: + void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T& buffer) const override; + + void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, + GpuBuffer_T& buffer) const override; + void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, + GpuBuffer_T& buffer) const override; + + private: + /// Helper Functions + /// As in PdfFieldPackInfo.h + Vector3< cell_idx_t > getNeighborShift(const BlockID& fineBlock, stencil::Direction dir) const; + bool areNeighborsInDirection(const Block* block, const BlockID& neighborID, + const Vector3< cell_idx_t > dirVec) const; + + CellInterval intervalHullInDirection(const CellInterval& ci, const Vector3< cell_idx_t > tangentialDir, + cell_idx_t width) const; + bool skipsThroughCoarseBlock(const Block* block, const Direction dir) const; + + void getCoarseBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const; + void getFineBlockCommIntervals(const 
BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const; + + CellInterval getCoarseBlockCoalescenceInterval(const Block* coarseBlock, const BlockID& fineBlockID, Direction dir, + const PdfField_T* field) const; + + const BlockDataID pdfFieldID_; + internal::NonuniformGPUPackingKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_; + + public: + const BlockDataID commDataID_; +}; + +/*********************************************************************************************************************** + * Factory Functions * + **********************************************************************************************************************/ + +template< typename PdfField_T > +std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > > + setupNonuniformGPUPdfCommunication(const std::weak_ptr< StructuredBlockForest >& blocks, + const BlockDataID pdfFieldID, + const std::string& dataIdentifier = "NonuniformGPUCommData"); + +} // namespace walberla::lbm_generated + +#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h" diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..adfbb419a8d3a3c82217fecf974977b28bb2a19b --- /dev/null +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h @@ -0,0 +1,713 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonuniformGeneratedGPUPdfPackInfo.impl.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "NonuniformGeneratedGPUPdfPackInfo.h" + +using namespace walberla::lbm_generated::util; + +namespace walberla::lbm_generated { + +/*********************************************************************************************************************** + * Factory Functions * + **********************************************************************************************************************/ + + +/** + * Sets up a NonuniformGeneratedPdfPackInfo. 
+ * + * @tparam LatticeStorageSpecification_T + * @tparam PackingKernels_T + * @param blocks + * @param pdfFieldID + * @param dataIdentifier + * @return + */ +template< typename PdfField_T> +std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > > + setupNonuniformGPUPdfCommunication( const std::weak_ptr< StructuredBlockForest > & blocks, + const BlockDataID pdfFieldID, + const std::string & dataIdentifier) +{ + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + + auto sbf = blocks.lock(); + WALBERLA_CHECK_NOT_NULLPTR(sbf, "Trying to create Nonuniform GPU Packinfo for a block storage object that doesn't exist anymore" ); + + auto handling = std::make_shared<NonuniformGPUCommDataHandling< LatticeStorageSpecification_T > >(blocks); + BlockDataID commDataID = sbf->addBlockData(handling, dataIdentifier); + + return std::make_shared<NonuniformGeneratedGPUPdfPackInfo< PdfField_T > >(pdfFieldID, commDataID); +} + + +/*********************************************************************************************************************** + * Equal Level Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block* receiver, + Direction dir, + GpuBuffer_T & buffer) +{ + auto field = receiver->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(receiver, dir) ? 
2 : 1; + field->getGhostRegion(dir, ci, gls, false); + uint_t size = kernels_.size(ci, dir); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.unpackDirection(field, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalEqualLevel( + const Block* sender, Block* receiver, stencil::Direction dir, gpuStream_t stream) +{ + auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 2 : 1; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, stream); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalEqualLevelCommFunction( + std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver, + stencil::Direction dir) +{ + auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); + +// VoidFunction t = std::bind(kernels_.localCopyDirection, +// srcField, srcRegion, dstField, dstRegion, dir, std::placeholders::_1 ); + +// CellInterval test(srcRegion.min(), srcRegion.max()); +// CellInterval test2(dstRegion.min(), dstRegion.max()); + + + auto commFunction = [this, srcField, srcRegion, dstField, dstRegion, dir](gpuStream_t gpuStream) + { + kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, gpuStream); + }; + commFunctions.emplace_back(commFunction); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( + const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + field->getSliceBeforeGhostLayer(dir, ci, gls, false); + uint_t size = kernels_.size(ci, dir); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.packDirection(field, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Coarse to Fine Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl( + const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const +{ + auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getCoarseBlockCommIntervals(fineReceiver, dir, field, intervals); + + for (auto t : intervals) + { + CellInterval ci = t.second; + uint_t size = kernels_.size(ci); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.packAll(field, ci, bufferPtr); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataCoarseToFine( + Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, GpuBuffer_T & buffer) +{ + auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getFineBlockCommIntervals(fineReceiver->getId(), dir, field, intervals); + + for (auto t : intervals) + { + Direction d = t.first; + CellInterval ci = t.second; + uint_t size = kernels_.redistributeSize(ci); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.unpackRedistribute(field, ci, bufferPtr, d); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) 
+{ + auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction const unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! 
+ unsigned char *buffer; + WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize)) + kernels_.packAll(srcField, srcInterval, buffer); + kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir); + WALBERLA_GPU_CHECK(gpuFree(buffer)) + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) +{ + auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction const unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + kernels_.packAll(srcField, srcInterval, bufferPtr, stream); + kernels_.unpackRedistribute(dstField, dstInterval, bufferPtr, unpackDir, stream); + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalCoarseToFineCommFunction( + std::vector< VoidFunction >& commFunctions, + const Block* coarseSender, Block* 
fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) +{ + auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > srcIntervals; + getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); + + std::vector< std::pair< Direction, CellInterval > > dstIntervals; + getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); + + WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) + + for(size_t index = 0; index < srcIntervals.size(); index++) + { + CellInterval srcInterval = srcIntervals[index].second; + + Direction const unpackDir = dstIntervals[index].first; + CellInterval dstInterval = dstIntervals[index].second; + + uint_t packSize = kernels_.size(srcInterval); + +#ifndef NDEBUG + Direction const packDir = srcIntervals[index].first; + WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) + uint_t unpackSize = kernels_.redistributeSize(dstInterval); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + + auto commFunction = [this, srcField, srcInterval, bufferPtr, dstField, dstInterval, unpackDir](gpuStream_t gpuStream) + { + kernels_.packAll(srcField, srcInterval, bufferPtr, gpuStream); + kernels_.unpackRedistribute(dstField, dstInterval, bufferPtr, unpackDir, gpuStream); + }; + commFunctions.emplace_back(commFunction); + } +} + + + +/*********************************************************************************************************************** + * Fine to Coarse Communication * + **********************************************************************************************************************/ + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::prepareCoalescence(Block* coarseReceiver, gpuStream_t gpuStream) +{ + auto 
dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + if(coarseReceiver->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + dstField->getSliceBeforeGhostLayer(*it, ci, 1); + kernels_.zeroCoalescenceRegion(dstField, ci, *it, gpuStream); + } + } +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataFineToCoarse( + Block* coarseReceiver, const walberla::BlockID& fineSender, walberla::stencil::Direction dir, + GpuBuffer_T & buffer) +{ + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + + CellInterval ci = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender, dir, dstField); + uint_t size = kernels_.size(ci, dir); + unsigned char* bufferPtr = buffer.advanceNoResize(size); + kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir) +{ + auto varFineSender = const_cast< Block * >(fineSender); + auto srcField = varFineSender->getData< PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, 
unpackSize) +#endif + + // TODO: This is a dirty workaround. Code-generate direct redistribution! + unsigned char *buffer; + WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize)) + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir); + kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir); + WALBERLA_GPU_CHECK(gpuFree(buffer)) +} + + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) +{ + auto varFineSender = const_cast< Block * >(fineSender); + auto srcField = varFineSender->getData< PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, stream); + kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, stream); +} + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalFineToCoarseCommFunction( + std::vector< VoidFunction >& commFunctions, + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, GpuBuffer_T & buffer) +{ + auto varFineSender = const_cast< Block * >(fineSender); + auto srcField 
= varFineSender->getData< PdfField_T >(pdfFieldID_); + auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); + auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); + Direction invDir = stencil::inverseDir[dir]; + + CellInterval srcInterval; + srcField->getGhostRegion(dir, srcInterval, 2); + uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); + + CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), + invDir, dstField); + +#ifndef NDEBUG + uint_t unpackSize = kernels_.size(dstInterval, invDir); + WALBERLA_ASSERT_EQUAL(packSize, unpackSize) +#endif + + auto bufferPtr = buffer.advanceNoResize(packSize); + auto commFunction = [this, srcField, maskField, srcInterval, bufferPtr, dir, dstField, dstInterval, invDir](gpuStream_t gpuStream) + { + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, gpuStream); + kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, gpuStream); + }; + commFunctions.emplace_back(commFunction); +} + + +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeEqualLevelSend( const Block * sender, stencil::Direction dir) +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 
2 : 1; + field->getSliceBeforeGhostLayer(dir, ci, gls, false); + return kernels_.size(ci, dir); +} + + + +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeCoarseToFineSend ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir) +{ + auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getCoarseBlockCommIntervals(fineReceiver, dir, field, intervals); + + uint_t size = 0; + + for (auto t : intervals) + { + CellInterval ci = t.second; + size += kernels_.size(ci); + } + WALBERLA_ASSERT_GREATER(size, 0) + return size; +} + + + +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeFineToCoarseSend ( const Block * sender, stencil::Direction dir) +{ + auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + + CellInterval ci; + field->getGhostRegion(dir, ci, 2); + return kernels_.partialCoalescenceSize(ci, dir); +} + + + +template< typename PdfField_T> +void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl( + const Block* fineSender, const walberla::BlockID& /*coarseReceiver*/, walberla::stencil::Direction dir, + GpuBuffer_T & buffer) const +{ + auto varBlock = const_cast< Block* >(fineSender); + auto srcField = varBlock->getData< PdfField_T >(pdfFieldID_); + auto commData = varBlock->getData< CommData_T >(commDataID_); + PartialCoalescenceMaskFieldGPU * maskField = &(commData->getMaskFieldGPU()); + + CellInterval ci; + srcField->getGhostRegion(dir, ci, 2); + uint_t size = kernels_.partialCoalescenceSize(ci, dir); + auto bufferPtr = buffer.advanceNoResize(size); + kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir); +} + +/*********************************************************************************************************************** + * Helper Functions * + 
**********************************************************************************************************************/ + +template< typename PdfField_T> +inline Vector3< cell_idx_t > + NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getNeighborShift(const BlockID& fineBlock, + stencil::Direction dir) const +{ + // dir: direction from coarse to fine block, or vice versa + Vector3< cell_idx_t > shift; + + uint_t const branchId = fineBlock.getBranchId(); + + shift[0] = (stencil::cx[dir] == 0) ? (((branchId & uint_t(1)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0); + shift[1] = (stencil::cy[dir] == 0) ? (((branchId & uint_t(2)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0); + shift[2] = (Stencil::D == uint_t(3)) ? + ((stencil::cz[dir] == 0) ? (((branchId & uint_t(4)) == uint_t(0)) ? cell_idx_t(-1) : cell_idx_t(1)) : + cell_idx_t(0)) : + cell_idx_t(0); + + return shift; +} + +/** + * Returns the part of a cell interval's hull of given width in direction dirVec. + * @param ci The original cell interval + * @param dirVec Direction Vector + * @param width Width of the hull + * @return Interval forming the part of the hull + */ +template< typename PdfField_T> +inline CellInterval NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::intervalHullInDirection( + const CellInterval& ci, const Vector3< cell_idx_t > dirVec, cell_idx_t width) const +{ + CellInterval result(ci); + for (uint_t i = 0; i < Stencil::D; i++) + { + if (dirVec[i] == 1) + { + result.min()[i] = result.max()[i] + cell_idx_t(1); + result.max()[i] += width; + } + if (dirVec[i] == -1) + { + result.max()[i] = result.min()[i] - cell_idx_t(1); + result.min()[i] -= width; + } + } + + return result; +} + +/** + * For edge or corner directions, checks if a coarser block is part of the respective edge or corner intersection. 
+ * @param block The local block + * @param dir The direction to check + * @return `true` if dir is an edge or corner direction skipping through a coarser block. + */ +template< typename PdfField_T> +inline bool NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::skipsThroughCoarseBlock( + const Block* block, const Direction dir) const +{ + Vector3< cell_idx_t > dirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]); + bool coarseBlockFound = false; + forEachSubdirectionCancel(dirVec, [&](Vector3< cell_idx_t > subdir) { + coarseBlockFound = + coarseBlockFound || block->neighborhoodSectionHasLargerBlock( + blockforest::getBlockNeighborhoodSectionIndex(subdir[0], subdir[1], subdir[2])); + return !coarseBlockFound; + }); + + return coarseBlockFound; +} + +/** + * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval) + * mapping sub-directions of the communication direction to cell intervals on the coarse block interior + * whose data must be communicated <i>as if</i> communicating in those sub-directions. 
+ * @param fineBlockID ID of the fine block + * @param dir Direction from the coarse to the fine block + * @param field Pointer to the PDF field on the coarse block + * @param intervals Vector that will be filled with the computed intervals + */ +template< typename PdfField_T> +inline void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getCoarseBlockCommIntervals( + const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const +{ + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir); + + CellInterval mainSlice; + field->getSliceBeforeGhostLayer(dir, mainSlice, 1, false); + + // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift + for (uint_t i = 0; i != Stencil::D; ++i) + { + if (shift[i] == cell_idx_t(-1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1); + } + if (shift[i] == cell_idx_t(1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)); + } + } + + intervals.emplace_back(dir, mainSlice); + + Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] }; + + // Get extended slices in all tangential directions for the diagonal part of communication + forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) { + CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(1)); + Direction subCommDir = stencil::vectorToDirection(commDirVec - t); + if(CommunicationStencil::containsDir(subCommDir)){ + intervals.emplace_back(subCommDir, hullInterval); + } + }); +} + +/** + * For coarse-to-fine and fine-to-coarse communication, returns a list of pairs (Direction, CellInterval) + * mapping sub-directions of the communication direction to cell intervals on the fine block whose data must + 
* be communicated <i>as if</i> communicating in those sub-directions. + * @param fineBlockID ID of the fine block + * @param dir Direction from the fine to the coarse block + * @param field Pointer to the PDF Field on the fine block + * @param intervals Vector that will be filled with the computed intervals + */ +template< typename PdfField_T> +inline void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getFineBlockCommIntervals( + const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + std::vector< std::pair< Direction, CellInterval > >& intervals) const +{ + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, dir); + + CellInterval mainSlice; + field->getGhostRegion(dir, mainSlice, 2, false); + intervals.emplace_back(dir, mainSlice); + + Vector3< cell_idx_t > const commDirVec{ stencil::cx[dir], stencil::cy[dir], stencil::cz[dir] }; + + forEachSubdirection(-shift, [&](Vector3< cell_idx_t > t) { + CellInterval hullInterval = intervalHullInDirection(mainSlice, t, cell_idx_t(2)); + Direction subCommDir = stencil::vectorToDirection(commDirVec + t); + if(CommunicationStencil::containsDir(subCommDir)){ + intervals.emplace_back(subCommDir, hullInterval); + } + }); +} +/** + * Checks whether or not the block with ID `neighborID` is a neighbor of `block` in direction `dir`. 
+ */ +template< typename PdfField_T> +bool NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::areNeighborsInDirection( + const Block* block, const BlockID& neighborID, const Vector3< cell_idx_t> dirVec) const +{ + uint_t const nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(dirVec[0], dirVec[1], dirVec[2]); + uint_t const nSecSize = block->getNeighborhoodSectionSize(nSecIdx); + + for(uint_t i = 0; i < nSecSize; i++){ + if(block->getNeighborId(nSecIdx, i) == neighborID){ + return true; + } + } + return false; +} + +template< typename PdfField_T> +CellInterval NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getCoarseBlockCoalescenceInterval( + const Block* coarseBlock, const BlockID& fineBlockID, Direction dir, const PdfField_T* field) const +{ + Direction mainDir(dir); + Vector3< cell_idx_t > commDirVec(stencil::cx[dir], stencil::cy[dir], stencil::cz[dir]); + Vector3< cell_idx_t > mainDirVec(commDirVec); + bool isAsymmetric = !areNeighborsInDirection(coarseBlock, fineBlockID, commDirVec); + + // If asymmetric, find the main subdirection + if(isAsymmetric){ + mainDirVec = Vector3< cell_idx_t >(0); + forEachSubdirection(commDirVec, [&](Vector3< cell_idx_t > subdirVec){ + if(areNeighborsInDirection(coarseBlock, fineBlockID, subdirVec)){ + // -dir is one main communication direction from F to C, but, due to periodicity, + // it might not be the only one. Find the main comm direction from the subdirections + // that is largest in the 1-norm. 
+ if(subdirVec.sqrLength() > mainDirVec.sqrLength()) mainDirVec = subdirVec; + } + }); + mainDir = stencil::vectorToDirection(mainDirVec); + } + + Vector3< cell_idx_t > shift = getNeighborShift(fineBlockID, mainDir); + + CellInterval mainSlice; + field->getSliceBeforeGhostLayer(mainDir, mainSlice, 1, false); + + // In all directions, restrict the slice to the lower or upper half, depending on neighbor shift + for (uint_t i = 0; i != Stencil::D; ++i) + { + if (shift[i] == cell_idx_t(-1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.max()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)) - cell_idx_t(1); + } + if (shift[i] == cell_idx_t(1)) + { + WALBERLA_ASSERT_EQUAL(mainSlice.size(i) & 1, 0) + mainSlice.min()[i] = mainSlice.min()[i] + cell_idx_c(mainSlice.size(i) / uint_t(2)); + } + } + + CellInterval commSlice(mainSlice); + + // If asymmetric, find coalescence slice as hull of main slice + if(isAsymmetric){ + commSlice = intervalHullInDirection(mainSlice, mainDirVec - commDirVec, 1); + } + + return commSlice; +} + +} // walberla::lbm_generated diff --git a/src/lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..894eb38034881feeda40c1a3d051455cbe98e173 --- /dev/null +++ b/src/lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h @@ -0,0 +1,272 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGeneratedGPUPdfPackInfo.h +//! \ingroup lbm +//! \author Markus Holzer <markus.holzer@fau.de> +//! \brief Class Template for Lattice Boltzmann PDF Pack Infos using code-generated kernels +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" + +#include "gpu/GPUWrapper.h" +#include "gpu/communication/GeneratedGPUPackInfo.h" + +#include "lbm/field/PdfField.h" + +#include "stencil/Directions.h" + +namespace walberla +{ +using gpu::GeneratedGPUPackInfo; + +namespace lbm_generated +{ +using stencil::Direction; + +namespace internal +{ +/* + * Base Template for Packing Kernels Wrapper. This wrapper is required for passing the time step to + * kernels generated for in-place streaming patterns. The generated code should not be templated. 
+ */ +template< typename PdfField_T, bool inplace > +class UniformPackingGPUKernelsWrapper +{ + public: + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, gpuStream_t stream) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, gpuStream_t stream) const = 0; + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, CellInterval& dstInterval, + gpuStream_t stream) const = 0; + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir, + gpuStream_t stream) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir, + gpuStream_t stream) const = 0; + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir, gpuStream_t stream) const = 0; + + uint_t size(CellInterval& ci, Direction dir) const = 0; + uint_t size(CellInterval& ci) const = 0; +}; + +/* + * Template Specialization for two-fields patterns, with trivial method wrappers. 
+ */ +template< typename PdfField_T > +class UniformPackingGPUKernelsWrapper< PdfField_T, false > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, gpuStream_t stream) const + { + kernels_.packAll(srcField, ci, outBuffer, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, gpuStream_t stream) const + { + kernels_.unpackAll(dstField, ci, inBuffer, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, CellInterval& dstInterval, + gpuStream_t stream) const + { + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir, + gpuStream_t stream) const + { + kernels_.packDirection(srcField, ci, outBuffer, dir, stream); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir, + gpuStream_t stream) const + { + kernels_.unpackDirection(dstField, ci, inBuffer, dir, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir, gpuStream_t stream) const + { + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, stream); + } + + uint_t size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + + private: + PackingKernels_T kernels_; +}; + +/* + * Template Specialization for in-place patterns, extracting the timestep from the lattice model. 
+ */ +template< typename PdfField_T > +class UniformPackingGPUKernelsWrapper< PdfField_T, true > +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + + void packAll(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packAll(srcField, ci, outBuffer, timestep, stream); + } + + void unpackAll(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, gpuStream_t stream) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackAll(dstField, ci, inBuffer, timestep, stream); + } + + void localCopyAll(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, CellInterval& dstInterval, + gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyAll(srcField, srcInterval, dstField, dstInterval, timestep, stream); + } + + void packDirection(PdfField_T* srcField, CellInterval& ci, unsigned char* outBuffer, Direction dir, + gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + kernels_.packDirection(srcField, ci, outBuffer, dir, timestep, stream); + } + + void unpackDirection(PdfField_T* dstField, CellInterval& ci, unsigned char* inBuffer, Direction dir, + gpuStream_t stream) const + { + uint8_t timestep = dstField->getTimestep(); + kernels_.unpackDirection(dstField, ci, inBuffer, dir, timestep, stream); + } + + void localCopyDirection(PdfField_T* srcField, CellInterval& srcInterval, PdfField_T* dstField, + CellInterval& dstInterval, Direction dir, gpuStream_t stream) const + { + uint8_t timestep = srcField->getTimestep(); + WALBERLA_ASSERT_EQUAL(timestep, dstField->getTimestep()) + kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream); + } + + uint_t 
size(CellInterval& ci, Direction dir) const { return kernels_.size(ci, dir); } + uint_t size(CellInterval& ci) const { return kernels_.size(ci); } + + private: + PackingKernels_T kernels_; +}; +} // namespace internal + +/** + * Pack Info class template for lattice Boltzmann PDF fields. Relies on a code-generated + * class providing kernel implementations for packing, unpacking and local copying of data. + * + * This template relies on a PackingKernels implementation generated by lbmpy_walberla.packing_kernels. + * The code generated part provides the kernels for transferring data between communication buffers + * and fields. The iteration slices are constructed by this class. + * + * The code-generated substructure enables the usage of arbitrary, in particular in-place streaming + * patterns. + * + * @tparam PackingKernels_T Type of a PackingKernels implementation generated using + * `lbmpy_walberla.generate_packing_kernels`. + * + * \ingroup lbm + */ +template< typename PdfField_T > +class UniformGeneratedGPUPdfPackInfo : public GeneratedGPUPackInfo +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + + UniformGeneratedGPUPdfPackInfo(const BlockDataID pdfFieldID, cell_idx_t cellLayersToSend = 1, bool sendAll = false) + : pdfFieldID_(pdfFieldID), ghostLayersToSend_(cellLayersToSend), sendAll_(sendAll) + {} + + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override; + void communicateLocal(stencil::Direction dir, const IBlock* sender, IBlock* receiver, gpuStream_t stream) override; + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override; + uint_t size(stencil::Direction dir, IBlock* block) override; + + private: + const BlockDataID pdfFieldID_; + 
internal::UniformPackingGPUKernelsWrapper< PdfField_T, LatticeStorageSpecification_T::inplace > kernels_; + cell_idx_t ghostLayersToSend_; + bool sendAll_; +}; + +template< typename PdfField_T > +void UniformGeneratedGPUPdfPackInfo< PdfField_T >::unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, + gpuStream_t stream) +{ + auto field = block->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getGhostRegion(dir, ci, ghostLayersToSend_, false); + + if (sendAll_) { kernels_.unpackAll(field, ci, buffer, stream); } + else { kernels_.unpackDirection(field, ci, buffer, dir, stream); } +} + +template< typename PdfField_T > +void UniformGeneratedGPUPdfPackInfo< PdfField_T >::pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, + gpuStream_t stream) +{ + auto field = const_cast< IBlock* >(block)->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getSliceBeforeGhostLayer(dir, ci, ghostLayersToSend_, false); + + if (sendAll_) { kernels_.packAll(field, ci, buffer, stream); } + else { kernels_.packDirection(field, ci, buffer, dir, stream); } +} + +template< typename PdfField_T > +void UniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocal(stencil::Direction dir, const IBlock* sender, + IBlock* receiver, gpuStream_t stream) +{ + auto srcField = const_cast< IBlock* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); + + CellInterval srcRegion; + CellInterval dstRegion; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, ghostLayersToSend_, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, ghostLayersToSend_, false); + + if (sendAll_) { kernels_.localCopyAll(srcField, srcRegion, dstField, dstRegion, stream); } + else { kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, stream); } +} + +template< typename PdfField_T > +uint_t UniformGeneratedGPUPdfPackInfo< PdfField_T >::size(stencil::Direction dir, IBlock* 
block) +{ + auto field = block->getData< PdfField_T >(pdfFieldID_); + CellInterval ci; + field->getGhostRegion(dir, ci, 1, false); + + uint_t elementsPerCell = kernels_.size(ci, dir); + return elementsPerCell; +} + +} // namespace lbm_generated +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h new file mode 100644 index 0000000000000000000000000000000000000000..7c6fdc828efd635abda7775bc46e078240d0f4b6 --- /dev/null +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.h @@ -0,0 +1,97 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStep.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/communication/NonUniformBufferedScheme.h" + +#include "lbm/field/PdfField.h" +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" + +#include "timeloop/SweepTimeloop.h" + +namespace walberla { + +using blockforest::communication::NonUniformBufferedScheme; + +namespace lbm_generated { + +/** + * + * @tparam LatticeStorageSpecification_T Generated storage specification + * @tparam SweepCollection_T LBM SweepCollection (must be able to call stream, collide, streamCollide and streamOnlyNoAdvancement) + * @tparam BoundaryCollection_T LBM Boundary collection (Functor that runs all boundary kernels at call) + */ +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T> +class BasicRecursiveTimeStep +{ + public: + using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; + using Stencil = typename LatticeStorageSpecification_T::Stencil; + using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; + using CommScheme = NonUniformBufferedScheme< CommunicationStencil >; + using PackInfo = lbm_generated::NonuniformGeneratedPdfPackInfo< PdfField_T >; + + BasicRecursiveTimeStep(std::shared_ptr< StructuredBlockForest > & sbfs, + const BlockDataID & pdfFieldId, SweepCollection_T & sweepCollection, BoundaryCollection_T & boundaryCollection, + std::shared_ptr< CommScheme > & commScheme, std::shared_ptr< PackInfo > & pdfFieldPackInfo): + sbfs_(sbfs), pdfFieldId_(pdfFieldId), pdfFieldPackInfo_(pdfFieldPackInfo), commScheme_(commScheme), + sweepCollection_(sweepCollection), boundaryCollection_(boundaryCollection) + { +#ifndef NDEBUG + for (auto& block : *sbfs) + WALBERLA_ASSERT(block.isDataOfType<PdfField_T>(pdfFieldId_), "Template parameter PdfField_T is of 
different type than BlockDataID pdfFieldId that is provided as constructor argument") +#endif + maxLevel_ = sbfs->getDepth(); + + for (uint_t level = 0; level <= maxLevel_; level++) + { + std::vector<Block *> blocks; + sbfs->getBlocks(blocks, level); + blocks_.push_back(blocks); + } + }; + + void operator() () { timestep(0); }; + void addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level=0); + + private: + void timestep(uint_t level); + void ghostLayerPropagation(Block * block); + std::function<void()> executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation=false); + std::function<void()> executeBoundaryHandlingOnLevel(uint_t level); + + std::shared_ptr< StructuredBlockForest > sbfs_; + uint_t maxLevel_; + std::vector<std::vector<Block *>> blocks_; + + const BlockDataID pdfFieldId_; + std::shared_ptr< PackInfo > pdfFieldPackInfo_; + std::shared_ptr< CommScheme > commScheme_; + + SweepCollection_T & sweepCollection_; + BoundaryCollection_T & boundaryCollection_; +}; + +} // namespace lbm_generated +} // namespace walberla + +#include "lbm_generated/refinement/BasicRecursiveTimeStep.impl.h" diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..29f7de7657e6d923b216d1ed4eae0229325a3762 --- /dev/null +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h @@ -0,0 +1,266 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file BasicRecursiveTimeStep.impl.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "BasicRecursiveTimeStep.h" + +namespace walberla { +namespace lbm_generated { + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::timestep(uint_t level) +{ + // 1.1 Collision + for(auto b: blocks_[level]){ + sweepCollection_.streamCollide(b); + } + + // 1.2 Recursive Descent + if(level < maxLevel_){ + timestep(level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + commScheme_->communicateCoarseToFine(level); + } + + // 1.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 1.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks_[level]){ + boundaryCollection_(b); + if(level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + commScheme_->communicateFineToCoarse(level + 1); + } + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + for(auto b: blocks_[level]){ + ghostLayerPropagation(b); // GL-Propagation first without swapping arrays... 
+ sweepCollection_.streamCollide(b); // then Stream-Collide on interior, and swap arrays + } + + // 2.2 Recursive Descent + if(level < maxLevel_){ + timestep(level + 1); + } + + // 2.4 Equal-Level Communication + commScheme_->communicateEqualLevel(level); + + // 2.5 Boundary Handling and Coalescence Preparation + for(auto b : blocks_[level]){ + boundaryCollection_(b); + if(level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b); + } + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + commScheme_->communicateFineToCoarse(level + 1); + } +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::addRefinementToTimeLoop(SweepTimeloop & timeloop, uint_t level) +{ + // 1.1 Collision + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level), "Refinement Cycle: streamCollide on level " + std::to_string(level)); + + // 1.2 Recursive Descent + if(level < maxLevel_){ + addRefinementToTimeLoop(timeloop, level + 1); + } + + // 1.3 Coarse to Fine Communication, receiving end + if(level != 0){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateCoarseToFineFunctor(level), "Refinement Cycle: communicate coarse to fine on level " + std::to_string(level)); + } + + // 1.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + + // 1.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 1.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_){ + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + 
} + + // Stop here if on coarsest level. + // Otherwise, continue to second subcycle. + if(level == 0) return; + + // 2.1 Collision and Ghost-Layer Propagation + timeloop.addFuncBeforeTimeStep(executeStreamCollideOnLevel(level, true), "Refinement Cycle: streamCollide with ghost layer propagation on level " + std::to_string(level)); + + // 2.2 Recursive Descent + if(level < maxLevel_) + addRefinementToTimeLoop(timeloop, level + 1); + + + // 2.4 Equal-Level Communication + timeloop.addFuncBeforeTimeStep(commScheme_->communicateEqualLevelFunctor(level), "Refinement Cycle: communicate equal level on level " + std::to_string(level)); + + // 2.5 Boundary Handling and Coalescence Preparation + timeloop.addFuncBeforeTimeStep(executeBoundaryHandlingOnLevel(level), "Refinement Cycle: boundary handling on level " + std::to_string(level)); + + // 2.6 Fine to Coarse Communication, receiving end + if(level < maxLevel_) + timeloop.addFuncBeforeTimeStep(commScheme_->communicateFineToCoarseFunctor(level + 1), "Refinement Cycle: communicate fine to coarse on level " + std::to_string(level + 1)); + +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeStreamCollideOnLevel(uint_t level, bool withGhostLayerPropagation) +{ + return [level, withGhostLayerPropagation, this]() + { + if (withGhostLayerPropagation) + { + for(auto b: blocks_[level]){ + ghostLayerPropagation(b); + sweepCollection_.streamCollide(b); + } + } + else + { + for(auto b: blocks_[level]){ + sweepCollection_.streamCollide(b); + } + } + }; +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +std::function<void()> BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::executeBoundaryHandlingOnLevel(uint_t level) +{ + return [level, this]() { + for (auto b : blocks_[level]) + { + 
boundaryCollection_(b); + if (level != maxLevel_) pdfFieldPackInfo_->prepareCoalescence(b); + } + }; +} + + +template< typename PdfField_T, typename SweepCollection_T, typename BoundaryCollection_T > +void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T >::ghostLayerPropagation( + Block * block) +{ + auto pdfField = block->getData<PdfField_T>(pdfFieldId_); + + for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ + uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); + // Propagate on ghost layers shadowing coarse or no blocks + if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + CellInterval ci; + pdfField->getGhostRegion(*it, ci, 1); + sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci); + } + } +} + +// Refinement Timestep from post collision state: +//template< typename PdfField_T, typename LbSweep_T > +//void BasicRecursiveTimeStep< PdfField_T, LbSweep_T >::timestep(uint_t level) +//{ +// std::vector<Block *> blocks; +// sbfs_->getBlocks(blocks, level); +// +// uint_t maxLevel = sbfs_->getDepth(); +// +// // 1.1 Equal-Level Communication +// commScheme_->communicateEqualLevel(level); +// +// // 1.2 Coarse to Fine Communication +// if(level < maxLevel){ +// commScheme_->communicateCoarseToFine(level + 1); +// } +// +// // 1.3 Boundary Handling and +// // 1.4 Prepare Coalescence (which happens during the recursive descent) +// for(auto b : blocks){ +// boundaryFunctor_(b); +// if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); +// } +// +// // 1.5 Recursive Descent +// if(level < maxLevel){ +// timestep(level + 1); +// } +// +// // 1.6 First Collision and ghost-layer propagation +// for(auto b: blocks){ +// if(level != 0) ghostLayerPropagation(b); // GL-Propagation first without swapping arrays... +// sweepCollection_.streamCollide(b); // then Stream-Collide on interior, and swap arrays +// } +// +// // Stop here if on coarsest level. 
+// // Otherwise, continue to second subcycle. +// if(level == 0) return; +// +// // 2.1 Equal-Level Communication +// commScheme_->communicateEqualLevel(level); +// +// // 2.2 Coarse to Fine Communication +// if(level < maxLevel){ +// commScheme_->communicateCoarseToFine(level + 1); +// } +// +// // 2.3 Boundary Handling and +// // 2.4 Prepare Coalescence (which happens during the recursive descent) +// for(auto b : blocks){ +// boundaryFunctor_(b); +// if(level != maxLevel) pdfFieldPackInfo_->prepareCoalescence(b); +// } +// +// // 2.5 Recursive Descent +// if(level < maxLevel){ +// timestep(level + 1); +// } +// +// // 2.6 Fine to Coarse Communication +// commScheme_->communicateFineToCoarse(level); +// +// // 2.7 Second Collision +// for(auto b: blocks){ +// sweepCollection_.streamCollide(b); +// } +//} + +} // namespace lbm_generated +} // namespace walberla diff --git a/src/lbm_generated/refinement/CMakeLists.txt b/src/lbm_generated/refinement/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..216b4a2683ebc426c8f30a5135d5c07db3409640 --- /dev/null +++ b/src/lbm_generated/refinement/CMakeLists.txt @@ -0,0 +1,6 @@ +target_sources( lbm_generated + PRIVATE + BasicRecursiveTimeStep.h + BasicRecursiveTimeStep.impl.h + RefinementScaling.h + ) diff --git a/src/lbm_generated/refinement/RefinementScaling.h b/src/lbm_generated/refinement/RefinementScaling.h new file mode 100644 index 0000000000000000000000000000000000000000..f8015946a4816e4c0e7c54ea43d2f310755aaec3 --- /dev/null +++ b/src/lbm_generated/refinement/RefinementScaling.h @@ -0,0 +1,63 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file RefinementScaling.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/BlockDataHandling.h" + +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +namespace walberla +{ +namespace lbm_generated +{ + +class DefaultRefinementScaling : public blockforest::AlwaysInitializeBlockDataHandling< real_t > +{ + public: + DefaultRefinementScaling(const weak_ptr< StructuredBlockStorage >& blocks, const real_t parameter) + : blocks_(blocks), parameter_(parameter){}; + + real_t* initialize(IBlock* const block) override + { + WALBERLA_ASSERT_NOT_NULLPTR(block) + auto blocks = blocks_.lock(); + WALBERLA_CHECK_NOT_NULLPTR(blocks) + + level_ = block->getBlockStorage().getLevel(*block); + + const real_t level_scale_factor = real_c(uint_t(1) << level_); + const real_t one = real_c(1.0); + const real_t half = real_c(0.5); + + return new real_t(parameter_ / (level_scale_factor * (-parameter_ * half + one) + parameter_ * half)); + } + bool operator==(const DefaultRefinementScaling& other) const { return level_ == other.level_; } + + private: + const weak_ptr< StructuredBlockStorage > blocks_; + const real_t 
parameter_; + + uint_t level_; +}; + +} // namespace lbm_generated +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/CMakeLists.txt b/src/lbm_generated/storage_specification/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..83d211632ca9366f1ac5f719a22d217f7c176061 --- /dev/null +++ b/src/lbm_generated/storage_specification/CMakeLists.txt @@ -0,0 +1,7 @@ +target_sources( lbm_generated + PRIVATE + D3Q19StorageSpecification.h + D3Q19StorageSpecification.cpp + D3Q27StorageSpecification.h + D3Q27StorageSpecification.cpp + ) \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/D3Q19StorageSpecification.cpp b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f36797eecca7282cf1f615492ac54cee38be871f --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.cpp @@ -0,0 +1,1939 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19StorageSpecification.cpp +//! 
\\author lbmpy +//====================================================================================================================== + +#include "D3Q19StorageSpecification.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +namespace internal_d3q19storagespecification_pack_ALL { +static FUNC_PREFIX void d3q19storagespecification_pack_ALL(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; 
+ double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 1] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 3] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 4] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 5] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 6] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 7] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 8] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 9] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 10] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 11] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 12] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 13] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 14] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 15] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 16] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 17] = 
_data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[19*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 19*_size_pdfs_src_2*ctr_1 + 19*ctr_2 + 18] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_ALL { +static FUNC_PREFIX void d3q19storagespecification_unpack_ALL(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 1]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 3]; + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 4]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 5]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 6]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 
+ 7]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 8]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 9]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 10]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 11]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 12]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 13]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 14]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 15]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 16]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 17]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[19*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 19*_size_pdfs_dst_2*ctr_1 + 19*ctr_2 + 18]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_ALL { +static FUNC_PREFIX void d3q19storagespecification_localCopy_ALL(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const 
_size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT 
_data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + + +namespace internal_d3q19storagespecification_pack_TE { +static FUNC_PREFIX void d3q19storagespecification_pack_TE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_SW { +static FUNC_PREFIX void d3q19storagespecification_pack_SW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + for 
(int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_T { +static FUNC_PREFIX void d3q19storagespecification_pack_T(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 
= 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BS { +static FUNC_PREFIX void d3q19storagespecification_pack_BS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_TN { +static FUNC_PREFIX void d3q19storagespecification_pack_TN(double * RESTRICT _data_buffer, double * RESTRICT const 
_data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BW { +static FUNC_PREFIX void d3q19storagespecification_pack_BW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_N { +static FUNC_PREFIX void d3q19storagespecification_pack_N(double * RESTRICT _data_buffer, double * RESTRICT const 
_data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 
5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_E { +static FUNC_PREFIX void d3q19storagespecification_pack_E(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 
5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_NW { +static FUNC_PREFIX void d3q19storagespecification_pack_NW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_NE { +static FUNC_PREFIX void d3q19storagespecification_pack_NE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, 
int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_TW { +static FUNC_PREFIX void d3q19storagespecification_pack_TW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BE { +static FUNC_PREFIX void d3q19storagespecification_pack_BE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t 
const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_W { +static FUNC_PREFIX void d3q19storagespecification_pack_W(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_S { +static FUNC_PREFIX void d3q19storagespecification_pack_S(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_SE { +static FUNC_PREFIX void d3q19storagespecification_pack_SE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const 
_stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_TS { +static FUNC_PREFIX void d3q19storagespecification_pack_TS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_BN { +static FUNC_PREFIX void d3q19storagespecification_pack_BN(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const 
_stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_pack_B { +static FUNC_PREFIX void d3q19storagespecification_pack_B(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + 
double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 1] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 3] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[5*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 5*_size_pdfs_src_2*ctr_1 + 5*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_BW { +static FUNC_PREFIX void d3q19storagespecification_unpack_BW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
_size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_N { +static FUNC_PREFIX void d3q19storagespecification_unpack_N(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_SE { +static FUNC_PREFIX void d3q19storagespecification_unpack_SE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TE { +static FUNC_PREFIX void d3q19storagespecification_unpack_TE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < 
_size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_T { +static FUNC_PREFIX void d3q19storagespecification_unpack_T(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TS { +static FUNC_PREFIX void d3q19storagespecification_unpack_TS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} 
+ +namespace internal_d3q19storagespecification_unpack_BE { +static FUNC_PREFIX void d3q19storagespecification_unpack_BE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_NW { +static FUNC_PREFIX void d3q19storagespecification_unpack_NW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } 
+ } + } +} +} + +namespace internal_d3q19storagespecification_unpack_NE { +static FUNC_PREFIX void d3q19storagespecification_unpack_NE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_BS { +static FUNC_PREFIX void d3q19storagespecification_unpack_BS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + 
ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_E { +static FUNC_PREFIX void d3q19storagespecification_unpack_E(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + 
_data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_S { +static FUNC_PREFIX void d3q19storagespecification_unpack_S(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT 
_data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_W { +static FUNC_PREFIX void d3q19storagespecification_unpack_W(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; 
ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_SW { +static FUNC_PREFIX void d3q19storagespecification_unpack_SW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + for (int64_t ctr_1 
= 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_B { +static FUNC_PREFIX void d3q19storagespecification_unpack_B(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + for (int64_t ctr_2 = 0; 
ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 1]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 3]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[5*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 5*_size_pdfs_dst_2*ctr_1 + 5*ctr_2 + 4]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TN { +static FUNC_PREFIX void d3q19storagespecification_unpack_TN(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_BN { +static FUNC_PREFIX void d3q19storagespecification_unpack_BN(const double * RESTRICT const _data_buffer, double 
* RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_unpack_TW { +static FUNC_PREFIX void d3q19storagespecification_unpack_TW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_NE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_NE(double * RESTRICT 
_data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TS { +static FUNC_PREFIX void d3q19storagespecification_localCopy_TS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 
+ 12*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_BE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BS { +static FUNC_PREFIX void d3q19storagespecification_localCopy_BS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, 
int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BW { +static FUNC_PREFIX void d3q19storagespecification_localCopy_BW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 
= 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_T { +static FUNC_PREFIX void d3q19storagespecification_localCopy_T(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TN { +static FUNC_PREFIX void d3q19storagespecification_localCopy_TN(double * RESTRICT _data_pdfs_dst, double 
* RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_W { +static FUNC_PREFIX void d3q19storagespecification_localCopy_W(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 
3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_E { +static FUNC_PREFIX void d3q19storagespecification_localCopy_E(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = 
_data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TW { 
+static FUNC_PREFIX void d3q19storagespecification_localCopy_TW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_SW { +static FUNC_PREFIX void d3q19storagespecification_localCopy_SW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_NW { +static FUNC_PREFIX void d3q19storagespecification_localCopy_NW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_BN { +static FUNC_PREFIX void 
d3q19storagespecification_localCopy_BN(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_TE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_TE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_B { +static FUNC_PREFIX void d3q19storagespecification_localCopy_B(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 
= _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q19storagespecification_localCopy_N { +static FUNC_PREFIX void d3q19storagespecification_localCopy_N(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT 
_data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_S { +static FUNC_PREFIX void d3q19storagespecification_localCopy_S(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + 
{ + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q19storagespecification_localCopy_SE { +static FUNC_PREFIX void d3q19storagespecification_localCopy_SE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + + + + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace lbm { + + void D3Q19StorageSpecification::PackKernels::packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const + { + double * buffer = reinterpret_cast<double*>(outBuffer); + double * RESTRICT _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + internal_d3q19storagespecification_pack_ALL::d3q19storagespecification_pack_ALL(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + } + + + void 
D3Q19StorageSpecification::PackKernels::unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const + { + double * buffer = reinterpret_cast<double*>(inBuffer); + double * RESTRICT const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + internal_d3q19storagespecification_unpack_ALL::d3q19storagespecification_unpack_ALL(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + } + + + void D3Q19StorageSpecification::PackKernels::localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + 
WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + 
internal_d3q19storagespecification_localCopy_ALL::d3q19storagespecification_localCopy_ALL(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + } + + void D3Q19StorageSpecification::PackKernels::packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const + { + double * buffer = reinterpret_cast<double*>(outBuffer); + double * RESTRICT _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q19storagespecification_pack_N::d3q19storagespecification_pack_N(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, 
_stride_pdfs_src_3); + break; + } + case stencil::S : { + internal_d3q19storagespecification_pack_S::d3q19storagespecification_pack_S(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::W : { + internal_d3q19storagespecification_pack_W::d3q19storagespecification_pack_W(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::E : { + internal_d3q19storagespecification_pack_E::d3q19storagespecification_pack_E(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::T : { + internal_d3q19storagespecification_pack_T::d3q19storagespecification_pack_T(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::B : { + internal_d3q19storagespecification_pack_B::d3q19storagespecification_pack_B(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NW : { + internal_d3q19storagespecification_pack_NW::d3q19storagespecification_pack_NW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NE : { + internal_d3q19storagespecification_pack_NE::d3q19storagespecification_pack_NE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case 
stencil::SW : { + internal_d3q19storagespecification_pack_SW::d3q19storagespecification_pack_SW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SE : { + internal_d3q19storagespecification_pack_SE::d3q19storagespecification_pack_SE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TN : { + internal_d3q19storagespecification_pack_TN::d3q19storagespecification_pack_TN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TS : { + internal_d3q19storagespecification_pack_TS::d3q19storagespecification_pack_TS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TW : { + internal_d3q19storagespecification_pack_TW::d3q19storagespecification_pack_TW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TE : { + internal_d3q19storagespecification_pack_TE::d3q19storagespecification_pack_TE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BN : { + internal_d3q19storagespecification_pack_BN::d3q19storagespecification_pack_BN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BS : { + 
internal_d3q19storagespecification_pack_BS::d3q19storagespecification_pack_BS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BW : { + internal_d3q19storagespecification_pack_BW::d3q19storagespecification_pack_BW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BE : { + internal_d3q19storagespecification_pack_BE::d3q19storagespecification_pack_BE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + }default: break; + } + } + + void D3Q19StorageSpecification::PackKernels::unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const + { + double * buffer = reinterpret_cast<double*>(inBuffer); + double * RESTRICT const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = 
int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q19storagespecification_unpack_N::d3q19storagespecification_unpack_N(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::S : { + internal_d3q19storagespecification_unpack_S::d3q19storagespecification_unpack_S(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::W : { + internal_d3q19storagespecification_unpack_W::d3q19storagespecification_unpack_W(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::E : { + internal_d3q19storagespecification_unpack_E::d3q19storagespecification_unpack_E(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::T : { + internal_d3q19storagespecification_unpack_T::d3q19storagespecification_unpack_T(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::B : { + internal_d3q19storagespecification_unpack_B::d3q19storagespecification_unpack_B(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::NW : { + 
internal_d3q19storagespecification_unpack_NW::d3q19storagespecification_unpack_NW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::NE : { + internal_d3q19storagespecification_unpack_NE::d3q19storagespecification_unpack_NE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::SW : { + internal_d3q19storagespecification_unpack_SW::d3q19storagespecification_unpack_SW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::SE : { + internal_d3q19storagespecification_unpack_SE::d3q19storagespecification_unpack_SE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TN : { + internal_d3q19storagespecification_unpack_TN::d3q19storagespecification_unpack_TN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TS : { + internal_d3q19storagespecification_unpack_TS::d3q19storagespecification_unpack_TS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TW : { + internal_d3q19storagespecification_unpack_TW::d3q19storagespecification_unpack_TW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TE : { + 
internal_d3q19storagespecification_unpack_TE::d3q19storagespecification_unpack_TE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BN : { + internal_d3q19storagespecification_unpack_BN::d3q19storagespecification_unpack_BN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BS : { + internal_d3q19storagespecification_unpack_BS::d3q19storagespecification_unpack_BS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BW : { + internal_d3q19storagespecification_unpack_BW::d3q19storagespecification_unpack_BW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BE : { + internal_d3q19storagespecification_unpack_BE::d3q19storagespecification_unpack_BE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + }default: break; + } + } + + void D3Q19StorageSpecification::PackKernels::localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), 
-int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q19storagespecification_localCopy_N::d3q19storagespecification_localCopy_N(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, 
_stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::S : { + internal_d3q19storagespecification_localCopy_S::d3q19storagespecification_localCopy_S(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::W : { + internal_d3q19storagespecification_localCopy_W::d3q19storagespecification_localCopy_W(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::E : { + internal_d3q19storagespecification_localCopy_E::d3q19storagespecification_localCopy_E(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::T : { + internal_d3q19storagespecification_localCopy_T::d3q19storagespecification_localCopy_T(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::B : { + internal_d3q19storagespecification_localCopy_B::d3q19storagespecification_localCopy_B(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; 
+ } + case stencil::NW : { + internal_d3q19storagespecification_localCopy_NW::d3q19storagespecification_localCopy_NW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NE : { + internal_d3q19storagespecification_localCopy_NE::d3q19storagespecification_localCopy_NE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SW : { + internal_d3q19storagespecification_localCopy_SW::d3q19storagespecification_localCopy_SW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SE : { + internal_d3q19storagespecification_localCopy_SE::d3q19storagespecification_localCopy_SE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TN : { + internal_d3q19storagespecification_localCopy_TN::d3q19storagespecification_localCopy_TN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TS : { + internal_d3q19storagespecification_localCopy_TS::d3q19storagespecification_localCopy_TS(_data_pdfs_dst, 
_data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TW : { + internal_d3q19storagespecification_localCopy_TW::d3q19storagespecification_localCopy_TW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TE : { + internal_d3q19storagespecification_localCopy_TE::d3q19storagespecification_localCopy_TE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BN : { + internal_d3q19storagespecification_localCopy_BN::d3q19storagespecification_localCopy_BN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BS : { + internal_d3q19storagespecification_localCopy_BS::d3q19storagespecification_localCopy_BS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BW : { + internal_d3q19storagespecification_localCopy_BW::d3q19storagespecification_localCopy_BW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, 
_stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BE : { + internal_d3q19storagespecification_localCopy_BE::d3q19storagespecification_localCopy_BE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + }default: break; + } + } + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/D3Q19StorageSpecification.h b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.h new file mode 100644 index 0000000000000000000000000000000000000000..7c2fb9e85eb2388361ad3737bb2cc64bcc075aea --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q19StorageSpecification.h @@ -0,0 +1,148 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19StorageSpecification.h +//! 
\\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" + +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" + +#include "stencil/D3Q19.h" +#include "stencil/Directions.h" + +#define FUNC_PREFIX + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla +{ +namespace lbm{ + +class D3Q19StorageSpecification +{ + public: + // Used lattice stencil + using Stencil = stencil::D3Q19; + // Lattice stencil used for the communication (should be used to define which block directions need to be communicated) + using CommunicationStencil = stencil::D3Q19; + // If false used correction: Lattice Boltzmann Model for the Incompressible Navier–Stokes Equation, He 1997 + static const bool compressible = false; + // Cut off for the lattice Boltzmann equilibrium + static const int equilibriumAccuracyOrder = 2; + // If true the equilibrium is computed in regard to "delta_rho" and not the actual density "rho" + static const bool equilibriumDeviationOnly = true; + // If streaming pattern is inplace (esotwist, aa, ...) or not (pull, push) + static const bool inplace = false; + // If true the background deviation (rho_0 = 1) is subtracted for the collision step. 
+ static const bool zeroCenteredPDFs = true; + // Lattice weights + static constexpr double w[19] = { 0.333333333333333,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0555555555555556,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778,0.0277777777777778 }; + // Inverse lattice weights + static constexpr double wInv[19] = { 3.00000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,18.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000,36.0000000000000 }; + + // Compute kernels to pack and unpack MPI buffers + class PackKernels { + + public: + using PdfField_T = field::GhostLayerField<double, 19>; + using value_type = typename PdfField_T::value_type; + + + + static const bool inplace = false; + + /** + * Packs all pdfs from the given cell interval to the send buffer. + * */ + void packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. + * */ + void unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. + * */ + void localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. 
+ * */ + void packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + * */ + void unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const; + + /** Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval onto the receiving interval. + * */ + void localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * 19 * sizeof(value_type); + } + + + + private: + const uint_t sizes[27] { 0, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; + }; + +}; + +}} //lbm/walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/D3Q27StorageSpecification.cpp b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..3ecdf88928bf8292254465e0f4ec19d4a1106373 --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.cpp @@ -0,0 +1,3099 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27StorageSpecification.cpp +//! 
\\author lbmpy +//====================================================================================================================== + +#include "D3Q27StorageSpecification.h" + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wfloat-equal" +# pragma GCC diagnostic ignored "-Wshadow" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +/************************************************************************************* + * Kernel Definitions +*************************************************************************************/ +namespace internal_d3q27storagespecification_pack_ALL { +static FUNC_PREFIX void d3q27storagespecification_pack_ALL(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 
= _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 1] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 3] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 4] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 5] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 6] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 7] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 8] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 9] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 10] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 11] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 12] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 13] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 14] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 15] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 16] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 17] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 18] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 19] = 
_data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 20] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 21] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 22] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 23] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 24] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 25] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[27*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 27*_size_pdfs_src_2*ctr_1 + 27*ctr_2 + 26] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_ALL { +static FUNC_PREFIX void d3q27storagespecification_unpack_ALL(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 
= _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 1]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 3]; + 
_data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 4]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 5]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 6]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 7]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 8]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 9]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 10]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 11]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 12]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 13]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 14]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 15]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 16]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 17]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 18]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 19]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 20]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 21]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 22]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 23]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 24]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 25]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[27*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 27*_size_pdfs_dst_2*ctr_1 + 27*ctr_2 + 26]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_ALL { +static FUNC_PREFIX void d3q27storagespecification_localCopy_ALL(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, 
int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0; + double * RESTRICT _data_pdfs_src_00_30 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0; + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_30_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_30; + double * RESTRICT _data_pdfs_src_00_30_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_30; + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT 
_data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_30_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_30_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + 
_data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + + +namespace internal_d3q27storagespecification_pack_T { +static FUNC_PREFIX void d3q27storagespecification_pack_T(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for 
(int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 
9*ctr_2 + 6] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BN { +static FUNC_PREFIX void d3q27storagespecification_pack_BN(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_NE { +static FUNC_PREFIX void d3q27storagespecification_pack_NE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BNE { +static FUNC_PREFIX void 
d3q27storagespecification_pack_BNE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_SE { +static FUNC_PREFIX void d3q27storagespecification_pack_SE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TNW { +static FUNC_PREFIX void d3q27storagespecification_pack_TNW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_W { +static FUNC_PREFIX void d3q27storagespecification_pack_W(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const 
_stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double 
* RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TE { +static FUNC_PREFIX void d3q27storagespecification_pack_TE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 
< _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_N { +static FUNC_PREFIX void d3q27storagespecification_pack_N(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 
+ 1] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BSW { +static FUNC_PREFIX void d3q27storagespecification_pack_BSW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + 
_data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TSW { +static FUNC_PREFIX void d3q27storagespecification_pack_TSW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BE { +static FUNC_PREFIX void d3q27storagespecification_pack_BE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t 
ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_B { +static FUNC_PREFIX void d3q27storagespecification_pack_B(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TNE { +static FUNC_PREFIX void d3q27storagespecification_pack_TNE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TS { +static FUNC_PREFIX void d3q27storagespecification_pack_TS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const 
_size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TN { +static FUNC_PREFIX void d3q27storagespecification_pack_TN(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; 
ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BNW { +static FUNC_PREFIX void d3q27storagespecification_pack_BNW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + 
_data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TW { +static FUNC_PREFIX void d3q27storagespecification_pack_TW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q27storagespecification_pack_BSE { +static FUNC_PREFIX void d3q27storagespecification_pack_BSE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_NW { +static FUNC_PREFIX void d3q27storagespecification_pack_NW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT 
_data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_S { +static FUNC_PREFIX void d3q27storagespecification_pack_S(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + 
_data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BS { +static FUNC_PREFIX void d3q27storagespecification_pack_BS(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = 
_data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_TSE { +static FUNC_PREFIX void d3q27storagespecification_pack_TSE(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + _size_pdfs_src_2*ctr_1 + ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_SW { +static FUNC_PREFIX void d3q27storagespecification_pack_SW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_BW { +static FUNC_PREFIX void d3q27storagespecification_pack_BW(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + 
double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 1] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[3*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 3*_size_pdfs_src_2*ctr_1 + 3*ctr_2 + 2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_pack_E { +static FUNC_PREFIX void d3q27storagespecification_pack_E(double * RESTRICT _data_buffer, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_src_0, int64_t const _size_pdfs_src_1, int64_t const _size_pdfs_src_2, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_src_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * 
RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_src_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_src_2; ctr_2 += 1) + { + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 1] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 3] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 
9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 4] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 5] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 6] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 7] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_buffer[9*_size_pdfs_src_1*_size_pdfs_src_2*ctr_0 + 9*_size_pdfs_src_2*ctr_1 + 9*ctr_2 + 8] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TSE { +static FUNC_PREFIX void d3q27storagespecification_unpack_TSE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_T { +static FUNC_PREFIX void d3q27storagespecification_unpack_T(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, 
int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * 
RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TN { +static FUNC_PREFIX void d3q27storagespecification_unpack_TN(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const 
_stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_SW { +static FUNC_PREFIX void d3q27storagespecification_unpack_SW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TNE { +static FUNC_PREFIX void d3q27storagespecification_unpack_TNE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BN { +static FUNC_PREFIX void d3q27storagespecification_unpack_BN(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_W { +static FUNC_PREFIX void 
d3q27storagespecification_unpack_W(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT 
_data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_E { +static FUNC_PREFIX void d3q27storagespecification_unpack_E(const double * RESTRICT const _data_buffer, double * 
RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT 
_data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BNE { +static FUNC_PREFIX void d3q27storagespecification_unpack_BNE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t 
const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TNW { +static FUNC_PREFIX void d3q27storagespecification_unpack_TNW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BSE { +static FUNC_PREFIX void d3q27storagespecification_unpack_BSE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const 
_size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BSW { +static FUNC_PREFIX void d3q27storagespecification_unpack_BSW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_SE { +static FUNC_PREFIX void d3q27storagespecification_unpack_SE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const 
_size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_N { +static FUNC_PREFIX void d3q27storagespecification_unpack_N(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < 
_size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < 
_size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_NE { +static FUNC_PREFIX void d3q27storagespecification_unpack_NE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + 
_stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TE { +static FUNC_PREFIX void d3q27storagespecification_unpack_TE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
26*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_B { +static FUNC_PREFIX void d3q27storagespecification_unpack_B(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 
9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_NW { +static FUNC_PREFIX void d3q27storagespecification_unpack_NW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_S { +static FUNC_PREFIX void d3q27storagespecification_unpack_S(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 1]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 3]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 4]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 5]; + 
_data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 6]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 7]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[9*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 9*_size_pdfs_dst_2*ctr_1 + 9*ctr_2 + 8]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TSW { +static FUNC_PREFIX void d3q27storagespecification_unpack_TSW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BE { +static FUNC_PREFIX void d3q27storagespecification_unpack_BE(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * 
RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BS { +static FUNC_PREFIX void d3q27storagespecification_unpack_BS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_320 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BW { +static FUNC_PREFIX void d3q27storagespecification_unpack_BW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT 
_data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TS { +static FUNC_PREFIX void d3q27storagespecification_unpack_TS(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + 
_data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_BNW { +static FUNC_PREFIX void d3q27storagespecification_unpack_BNW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + _size_pdfs_dst_2*ctr_1 + ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_unpack_TW { +static FUNC_PREFIX void d3q27storagespecification_unpack_TW(const double * RESTRICT const _data_buffer, double * RESTRICT _data_pdfs_dst, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * 
RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 1]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_buffer[3*_size_pdfs_dst_1*_size_pdfs_dst_2*ctr_0 + 3*_size_pdfs_dst_2*ctr_1 + 3*ctr_2 + 2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_SE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_SE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TS { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t 
const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace 
internal_d3q27storagespecification_localCopy_BNW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BNW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TSW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TSW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322 = 
_data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 
1) + { + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TNE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TNE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 
+ _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BS { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BS(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT 
_data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_W { +static FUNC_PREFIX void d3q27storagespecification_localCopy_W(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 3*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_33 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 3*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT 
_data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_33_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_33; + double * RESTRICT _data_pdfs_src_00_33_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_33; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_313_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_33_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_33_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TSE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TSE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_NE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_NE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, 
int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_B { +static FUNC_PREFIX void d3q27storagespecification_localCopy_B(double * RESTRICT 
_data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 6*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_36 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 6*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + 
_stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_36_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_36; + double * RESTRICT _data_pdfs_src_00_36_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_36; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + 
_data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_36_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_36_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TNW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TNW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) 
+ { + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_NW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_NW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t 
ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BN { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BN(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 
23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t 
ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const 
_size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 17*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_317 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 17*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_317_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_317; + double * RESTRICT _data_pdfs_src_00_317_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_317; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_317_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_317_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_SW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_SW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_326_10 = 
_stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_T { +static FUNC_PREFIX void d3q27storagespecification_localCopy_T(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 5*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_35 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 5*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_313 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 13*_stride_pdfs_dst_3; + double * RESTRICT 
_data_pdfs_src_00_313 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 13*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_35_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_35; + double * RESTRICT _data_pdfs_src_00_35_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_35; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_313_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_313; + double * RESTRICT _data_pdfs_src_00_313_10 = 
_stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_313; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_35_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_35_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_313_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_313_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BSW { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BSW(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_S { +static FUNC_PREFIX void d3q27storagespecification_localCopy_S(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < 
_size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 2*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_32 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 2*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_39 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 9*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_39 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 9*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_312 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 12*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_312 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 12*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_316 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 16*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_316 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 16*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_322 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 22*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_322 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 22*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_326 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 26*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_326 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 
+ 26*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_32_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_32; + double * RESTRICT _data_pdfs_src_00_32_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_32; + double * RESTRICT _data_pdfs_dst_00_39_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_39; + double * RESTRICT _data_pdfs_src_00_39_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_39; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_312_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_312; + double * RESTRICT _data_pdfs_src_00_312_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_312; + double * RESTRICT _data_pdfs_dst_00_316_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_316; + double * RESTRICT _data_pdfs_src_00_316_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_316; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_322_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_322; + double * RESTRICT _data_pdfs_src_00_322_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_322; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + double * RESTRICT _data_pdfs_dst_00_326_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_326; + double * RESTRICT _data_pdfs_src_00_326_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_326; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_32_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_32_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_39_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_39_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_312_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_312_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_316_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_316_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_322_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_322_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_326_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_326_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_TN { +static FUNC_PREFIX void d3q27storagespecification_localCopy_TN(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 
19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_E { +static FUNC_PREFIX void d3q27storagespecification_localCopy_E(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_34 = _data_pdfs_dst + 
_stride_pdfs_dst_0*ctr_0 + 4*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_34 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 4*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_310 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 10*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_310 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 10*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_314 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 14*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_314 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 14*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_321 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 21*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_321 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 21*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + 
double * RESTRICT _data_pdfs_dst_00_34_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_34; + double * RESTRICT _data_pdfs_src_00_34_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_34; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_310_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_310; + double * RESTRICT _data_pdfs_src_00_310_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_310; + double * RESTRICT _data_pdfs_dst_00_314_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_314; + double * RESTRICT _data_pdfs_src_00_314_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_314; + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_321_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_321; + double * RESTRICT _data_pdfs_src_00_321_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_321; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_34_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_34_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_310_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_310_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_314_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_314_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_321_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_321_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_N { +static FUNC_PREFIX void d3q27storagespecification_localCopy_N(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + _stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_31 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + _stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_37 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 7*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_37 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 7*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_38 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 8*_stride_pdfs_dst_3; + 
double * RESTRICT _data_pdfs_src_00_38 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 8*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_311 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 11*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_311 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 11*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_315 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 15*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_315 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 15*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_319 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 19*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_319 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 19*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_320 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 20*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_320 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 20*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_324 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 24*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_324 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 24*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_31_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_31; + double * RESTRICT _data_pdfs_src_00_31_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_31; + double * RESTRICT _data_pdfs_dst_00_37_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_37; + double * RESTRICT _data_pdfs_src_00_37_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_37; + double * RESTRICT _data_pdfs_dst_00_38_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_38; + double * 
RESTRICT _data_pdfs_src_00_38_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_38; + double * RESTRICT _data_pdfs_dst_00_311_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_311; + double * RESTRICT _data_pdfs_src_00_311_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_311; + double * RESTRICT _data_pdfs_dst_00_315_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_315; + double * RESTRICT _data_pdfs_src_00_315_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_315; + double * RESTRICT _data_pdfs_dst_00_319_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_319; + double * RESTRICT _data_pdfs_src_00_319_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_319; + double * RESTRICT _data_pdfs_dst_00_320_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_320; + double * RESTRICT _data_pdfs_src_00_320_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_320; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_324_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_324; + double * RESTRICT _data_pdfs_src_00_324_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_324; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_31_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_31_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_37_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_37_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_38_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_38_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_311_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_311_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_315_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_315_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_319_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_319_10[_stride_pdfs_src_2*ctr_2]; + 
_data_pdfs_dst_00_320_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_320_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_324_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_324_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BSE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BSE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t 
const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 18*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_318 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 18*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + double * RESTRICT _data_pdfs_dst_00_325 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 25*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_325 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 25*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_318_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_318; + double * RESTRICT _data_pdfs_src_00_318_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_318; + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + double * RESTRICT _data_pdfs_dst_00_325_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_325; + double * RESTRICT _data_pdfs_src_00_325_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_325; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_318_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_318_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + _data_pdfs_dst_00_325_10[_stride_pdfs_dst_2*ctr_2] = 
_data_pdfs_src_00_325_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + +namespace internal_d3q27storagespecification_localCopy_BNE { +static FUNC_PREFIX void d3q27storagespecification_localCopy_BNE(double * RESTRICT _data_pdfs_dst, double * RESTRICT const _data_pdfs_src, int64_t const _size_pdfs_dst_0, int64_t const _size_pdfs_dst_1, int64_t const _size_pdfs_dst_2, int64_t const _stride_pdfs_dst_0, int64_t const _stride_pdfs_dst_1, int64_t const _stride_pdfs_dst_2, int64_t const _stride_pdfs_dst_3, int64_t const _stride_pdfs_src_0, int64_t const _stride_pdfs_src_1, int64_t const _stride_pdfs_src_2, int64_t const _stride_pdfs_src_3) +{ + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_dst_0; ctr_0 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323 = _data_pdfs_dst + _stride_pdfs_dst_0*ctr_0 + 23*_stride_pdfs_dst_3; + double * RESTRICT _data_pdfs_src_00_323 = _data_pdfs_src + _stride_pdfs_src_0*ctr_0 + 23*_stride_pdfs_src_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_dst_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_dst_00_323_10 = _stride_pdfs_dst_1*ctr_1 + _data_pdfs_dst_00_323; + double * RESTRICT _data_pdfs_src_00_323_10 = _stride_pdfs_src_1*ctr_1 + _data_pdfs_src_00_323; + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_dst_2; ctr_2 += 1) + { + _data_pdfs_dst_00_323_10[_stride_pdfs_dst_2*ctr_2] = _data_pdfs_src_00_323_10[_stride_pdfs_src_2*ctr_2]; + } + } + } +} +} + + + + +/************************************************************************************* + * Kernel Wrappers +*************************************************************************************/ + +namespace walberla { +namespace lbm { + + void D3Q27StorageSpecification::PackKernels::packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const + { + double * buffer = reinterpret_cast<double*>(outBuffer); + double * RESTRICT _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), 
-int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + internal_d3q27storagespecification_pack_ALL::d3q27storagespecification_pack_ALL(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + } + + + void D3Q27StorageSpecification::PackKernels::unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const + { + double * buffer = reinterpret_cast<double*>(inBuffer); + double * RESTRICT const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0); + 
WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + internal_d3q27storagespecification_unpack_ALL::d3q27storagespecification_unpack_ALL(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + } + + + void D3Q27StorageSpecification::PackKernels::localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), 
srcInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + internal_d3q27storagespecification_localCopy_ALL::d3q27storagespecification_localCopy_ALL(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + } + + void D3Q27StorageSpecification::PackKernels::packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const + { + double * buffer = reinterpret_cast<double*>(outBuffer); + double * RESTRICT _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + 
double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_src_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_src_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_src->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_src_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q27storagespecification_pack_N::d3q27storagespecification_pack_N(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::S : { + internal_d3q27storagespecification_pack_S::d3q27storagespecification_pack_S(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::W : { + internal_d3q27storagespecification_pack_W::d3q27storagespecification_pack_W(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::E : { + internal_d3q27storagespecification_pack_E::d3q27storagespecification_pack_E(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + 
break; + } + case stencil::T : { + internal_d3q27storagespecification_pack_T::d3q27storagespecification_pack_T(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::B : { + internal_d3q27storagespecification_pack_B::d3q27storagespecification_pack_B(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NW : { + internal_d3q27storagespecification_pack_NW::d3q27storagespecification_pack_NW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NE : { + internal_d3q27storagespecification_pack_NE::d3q27storagespecification_pack_NE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SW : { + internal_d3q27storagespecification_pack_SW::d3q27storagespecification_pack_SW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SE : { + internal_d3q27storagespecification_pack_SE::d3q27storagespecification_pack_SE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TN : { + internal_d3q27storagespecification_pack_TN::d3q27storagespecification_pack_TN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TS : { + 
internal_d3q27storagespecification_pack_TS::d3q27storagespecification_pack_TS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TW : { + internal_d3q27storagespecification_pack_TW::d3q27storagespecification_pack_TW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TE : { + internal_d3q27storagespecification_pack_TE::d3q27storagespecification_pack_TE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BN : { + internal_d3q27storagespecification_pack_BN::d3q27storagespecification_pack_BN(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BS : { + internal_d3q27storagespecification_pack_BS::d3q27storagespecification_pack_BS(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BW : { + internal_d3q27storagespecification_pack_BW::d3q27storagespecification_pack_BW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BE : { + internal_d3q27storagespecification_pack_BE::d3q27storagespecification_pack_BE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNE : { + 
internal_d3q27storagespecification_pack_TNE::d3q27storagespecification_pack_TNE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNW : { + internal_d3q27storagespecification_pack_TNW::d3q27storagespecification_pack_TNW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSE : { + internal_d3q27storagespecification_pack_TSE::d3q27storagespecification_pack_TSE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSW : { + internal_d3q27storagespecification_pack_TSW::d3q27storagespecification_pack_TSW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNE : { + internal_d3q27storagespecification_pack_BNE::d3q27storagespecification_pack_BNE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNW : { + internal_d3q27storagespecification_pack_BNW::d3q27storagespecification_pack_BNW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSE : { + internal_d3q27storagespecification_pack_BSE::d3q27storagespecification_pack_BSE(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSW : { + 
internal_d3q27storagespecification_pack_BSW::d3q27storagespecification_pack_BSW(_data_buffer, _data_pdfs_src, _size_pdfs_src_0, _size_pdfs_src_1, _size_pdfs_src_2, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + }default: break; + } + } + + void D3Q27StorageSpecification::PackKernels::unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const + { + double * buffer = reinterpret_cast<double*>(inBuffer); + double * RESTRICT const _data_buffer = buffer; + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q27storagespecification_unpack_N::d3q27storagespecification_unpack_N(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::S : { + 
internal_d3q27storagespecification_unpack_S::d3q27storagespecification_unpack_S(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::W : { + internal_d3q27storagespecification_unpack_W::d3q27storagespecification_unpack_W(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::E : { + internal_d3q27storagespecification_unpack_E::d3q27storagespecification_unpack_E(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::T : { + internal_d3q27storagespecification_unpack_T::d3q27storagespecification_unpack_T(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::B : { + internal_d3q27storagespecification_unpack_B::d3q27storagespecification_unpack_B(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::NW : { + internal_d3q27storagespecification_unpack_NW::d3q27storagespecification_unpack_NW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::NE : { + internal_d3q27storagespecification_unpack_NE::d3q27storagespecification_unpack_NE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::SW : { + 
internal_d3q27storagespecification_unpack_SW::d3q27storagespecification_unpack_SW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::SE : { + internal_d3q27storagespecification_unpack_SE::d3q27storagespecification_unpack_SE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TN : { + internal_d3q27storagespecification_unpack_TN::d3q27storagespecification_unpack_TN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TS : { + internal_d3q27storagespecification_unpack_TS::d3q27storagespecification_unpack_TS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TW : { + internal_d3q27storagespecification_unpack_TW::d3q27storagespecification_unpack_TW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TE : { + internal_d3q27storagespecification_unpack_TE::d3q27storagespecification_unpack_TE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BN : { + internal_d3q27storagespecification_unpack_BN::d3q27storagespecification_unpack_BN(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BS : { + 
internal_d3q27storagespecification_unpack_BS::d3q27storagespecification_unpack_BS(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BW : { + internal_d3q27storagespecification_unpack_BW::d3q27storagespecification_unpack_BW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BE : { + internal_d3q27storagespecification_unpack_BE::d3q27storagespecification_unpack_BE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TNE : { + internal_d3q27storagespecification_unpack_TNE::d3q27storagespecification_unpack_TNE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TNW : { + internal_d3q27storagespecification_unpack_TNW::d3q27storagespecification_unpack_TNW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TSE : { + internal_d3q27storagespecification_unpack_TSE::d3q27storagespecification_unpack_TSE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::TSW : { + internal_d3q27storagespecification_unpack_TSW::d3q27storagespecification_unpack_TSW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case 
stencil::BNE : { + internal_d3q27storagespecification_unpack_BNE::d3q27storagespecification_unpack_BNE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BNW : { + internal_d3q27storagespecification_unpack_BNW::d3q27storagespecification_unpack_BNW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BSE : { + internal_d3q27storagespecification_unpack_BSE::d3q27storagespecification_unpack_BSE(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + } + case stencil::BSW : { + internal_d3q27storagespecification_unpack_BSW::d3q27storagespecification_unpack_BSW(_data_buffer, _data_pdfs_dst, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3); + break; + }default: break; + } + } + + void D3Q27StorageSpecification::PackKernels::localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const + { + WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize()) + WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize()) + WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize()) + + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.xMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.yMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(dstInterval.zMin(), -int_c(pdfs_dst->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_dst = pdfs_dst->dataAt(dstInterval.xMin(), dstInterval.yMin(), dstInterval.zMin(), 0); + 
WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.xMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.yMin(), -int_c(pdfs_src->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(srcInterval.zMin(), -int_c(pdfs_src->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs_src = pdfs_src->dataAt(srcInterval.xMin(), srcInterval.yMin(), srcInterval.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->xSizeWithGhostLayer(), int64_t(int64_c(dstInterval.xSize()) + 0)) + const int64_t _size_pdfs_dst_0 = int64_t(int64_c(dstInterval.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->ySizeWithGhostLayer(), int64_t(int64_c(dstInterval.ySize()) + 0)) + const int64_t _size_pdfs_dst_1 = int64_t(int64_c(dstInterval.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs_dst->zSizeWithGhostLayer(), int64_t(int64_c(dstInterval.zSize()) + 0)) + const int64_t _size_pdfs_dst_2 = int64_t(int64_c(dstInterval.zSize()) + 0); + const int64_t _stride_pdfs_dst_0 = int64_t(pdfs_dst->xStride()); + const int64_t _stride_pdfs_dst_1 = int64_t(pdfs_dst->yStride()); + const int64_t _stride_pdfs_dst_2 = int64_t(pdfs_dst->zStride()); + const int64_t _stride_pdfs_dst_3 = int64_t(1 * int64_t(pdfs_dst->fStride())); + const int64_t _stride_pdfs_src_0 = int64_t(pdfs_src->xStride()); + const int64_t _stride_pdfs_src_1 = int64_t(pdfs_src->yStride()); + const int64_t _stride_pdfs_src_2 = int64_t(pdfs_src->zStride()); + const int64_t _stride_pdfs_src_3 = int64_t(1 * int64_t(pdfs_src->fStride())); + switch (dir) { + case stencil::N : { + internal_d3q27storagespecification_localCopy_N::d3q27storagespecification_localCopy_N(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::S : { + 
internal_d3q27storagespecification_localCopy_S::d3q27storagespecification_localCopy_S(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::W : { + internal_d3q27storagespecification_localCopy_W::d3q27storagespecification_localCopy_W(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::E : { + internal_d3q27storagespecification_localCopy_E::d3q27storagespecification_localCopy_E(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::T : { + internal_d3q27storagespecification_localCopy_T::d3q27storagespecification_localCopy_T(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::B : { + internal_d3q27storagespecification_localCopy_B::d3q27storagespecification_localCopy_B(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NW : { + internal_d3q27storagespecification_localCopy_NW::d3q27storagespecification_localCopy_NW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, 
_size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::NE : { + internal_d3q27storagespecification_localCopy_NE::d3q27storagespecification_localCopy_NE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SW : { + internal_d3q27storagespecification_localCopy_SW::d3q27storagespecification_localCopy_SW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::SE : { + internal_d3q27storagespecification_localCopy_SE::d3q27storagespecification_localCopy_SE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TN : { + internal_d3q27storagespecification_localCopy_TN::d3q27storagespecification_localCopy_TN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TS : { + internal_d3q27storagespecification_localCopy_TS::d3q27storagespecification_localCopy_TS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, 
_stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TW : { + internal_d3q27storagespecification_localCopy_TW::d3q27storagespecification_localCopy_TW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TE : { + internal_d3q27storagespecification_localCopy_TE::d3q27storagespecification_localCopy_TE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BN : { + internal_d3q27storagespecification_localCopy_BN::d3q27storagespecification_localCopy_BN(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BS : { + internal_d3q27storagespecification_localCopy_BS::d3q27storagespecification_localCopy_BS(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BW : { + internal_d3q27storagespecification_localCopy_BW::d3q27storagespecification_localCopy_BW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BE : { + 
internal_d3q27storagespecification_localCopy_BE::d3q27storagespecification_localCopy_BE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNE : { + internal_d3q27storagespecification_localCopy_TNE::d3q27storagespecification_localCopy_TNE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TNW : { + internal_d3q27storagespecification_localCopy_TNW::d3q27storagespecification_localCopy_TNW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSE : { + internal_d3q27storagespecification_localCopy_TSE::d3q27storagespecification_localCopy_TSE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::TSW : { + internal_d3q27storagespecification_localCopy_TSW::d3q27storagespecification_localCopy_TSW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNE : { + internal_d3q27storagespecification_localCopy_BNE::d3q27storagespecification_localCopy_BNE(_data_pdfs_dst, _data_pdfs_src, 
_size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BNW : { + internal_d3q27storagespecification_localCopy_BNW::d3q27storagespecification_localCopy_BNW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSE : { + internal_d3q27storagespecification_localCopy_BSE::d3q27storagespecification_localCopy_BSE(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + } + case stencil::BSW : { + internal_d3q27storagespecification_localCopy_BSW::d3q27storagespecification_localCopy_BSW(_data_pdfs_dst, _data_pdfs_src, _size_pdfs_dst_0, _size_pdfs_dst_1, _size_pdfs_dst_2, _stride_pdfs_dst_0, _stride_pdfs_dst_1, _stride_pdfs_dst_2, _stride_pdfs_dst_3, _stride_pdfs_src_0, _stride_pdfs_src_1, _stride_pdfs_src_2, _stride_pdfs_src_3); + break; + }default: break; + } + } + + +} // namespace lbm +} // namespace walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/D3Q27StorageSpecification.h b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.h new file mode 100644 index 0000000000000000000000000000000000000000..42599878544c3e4632603b7141074e9196b2153a --- /dev/null +++ b/src/lbm_generated/storage_specification/D3Q27StorageSpecification.h @@ -0,0 +1,148 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27StorageSpecification.h +//! \\author lbmpy +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/cell/CellInterval.h" +#include "core/mpi/SendBuffer.h" +#include "core/mpi/RecvBuffer.h" + +#include "domain_decomposition/IBlock.h" +#include "field/GhostLayerField.h" + +#include "stencil/D3Q27.h" +#include "stencil/Directions.h" + +#define FUNC_PREFIX + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +namespace walberla +{ +namespace lbm{ + +class D3Q27StorageSpecification +{ + public: + // Used lattice stencil + using Stencil = stencil::D3Q27; + // Lattice stencil used for the communication (should be used to define which block directions need to be communicated) + using CommunicationStencil = stencil::D3Q27; + // If false used correction: Lattice Boltzmann Model for the Incompressible Navier–Stokes Equation, He 1997 + static const bool compressible = false; + // Cut 
off for the lattice Boltzmann equilibrium + static const int equilibriumAccuracyOrder = 2; + // If true the equilibrium is computed in regard to "delta_rho" and not the actual density "rho" + static const bool equilibriumDeviationOnly = true; + // If streaming pattern is inplace (esotwist, aa, ...) or not (pull, push) + static const bool inplace = false; + // If true the background deviation (rho_0 = 1) is subtracted for the collision step. + static const bool zeroCenteredPDFs = true; + // Lattice weights + static constexpr double w[27] = { 0.296296296296296,0.0740740740740741,0.0740740740740741,0.0740740740740741,0.0740740740740741,0.0740740740740741,0.0740740740740741,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.0185185185185185,0.00462962962962963,0.00462962962962963,0.00462962962962963,0.00462962962962963,0.00462962962962963,0.00462962962962963,0.00462962962962963,0.00462962962962963 }; + // Inverse lattice weights + static constexpr double wInv[27] = { 3.37500000000000,13.5000000000000,13.5000000000000,13.5000000000000,13.5000000000000,13.5000000000000,13.5000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,54.0000000000000,216.000000000000,216.000000000000,216.000000000000,216.000000000000,216.000000000000,216.000000000000,216.000000000000,216.000000000000 }; + + // Compute kernels to pack and unpack MPI buffers + class PackKernels { + + public: + using PdfField_T = field::GhostLayerField<double, 27>; + using value_type = typename PdfField_T::value_type; + + + + static const bool inplace = false; + + /** + * Packs all pdfs from the given cell interval to the send buffer. 
+ * */ + void packAll(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer) const; + + /** + * Unpacks all pdfs from the send buffer to the given cell interval. + * */ + void unpackAll(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer) const; + + /** + * Copies data between two blocks on the same process. + * All pdfs from the sending interval are copied onto the receiving interval. + * */ + void localCopyAll(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval) const; + + /** + * Packs only those populations streaming in directions aligned with the sending direction dir from the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are packed. + * */ + void packDirection(PdfField_T * pdfs_src, CellInterval & ci, unsigned char * outBuffer, stencil::Direction dir) const; + + /** + * Unpacks only those populations streaming in directions aligned with the sending direction dir to the given cell interval. + * For example, in 2D, if dir == N, the pdfs streaming in directions NW, N, NE are unpacked. + * */ + void unpackDirection(PdfField_T * pdfs_dst, CellInterval & ci, unsigned char * inBuffer, stencil::Direction dir) const; + + /** Copies data between two blocks on the same process. + * PDFs streaming aligned with the direction dir are copied from the sending interval onto the receiving interval. 
+ * */ + void localCopyDirection(PdfField_T * pdfs_src, CellInterval & srcInterval, PdfField_T * pdfs_dst, CellInterval & dstInterval, stencil::Direction dir) const; + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packDirection / unpackDirection + * @param ci The cell interval + * @param dir The communication direction + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci, stencil::Direction dir) const { + return ci.numCells() * sizes[dir] * sizeof(value_type); + } + + /** + * Returns the number of bytes that will be packed from / unpacked to the cell interval + * when using packAll / unpackAll + * @param ci The cell interval + * @return The required size of the buffer, in bytes + * */ + uint_t size (CellInterval & ci) const { + return ci.numCells() * 27 * sizeof(value_type); + } + + + + private: + const uint_t sizes[27] { 0, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 }; + }; + +}; + +}} //lbm/walberla \ No newline at end of file diff --git a/src/lbm_generated/storage_specification/storage_specification_generation_script.py b/src/lbm_generated/storage_specification/storage_specification_generation_script.py new file mode 100644 index 0000000000000000000000000000000000000000..d7432ee70d6233edbd4c408199f1d89ae4fe1e6d --- /dev/null +++ b/src/lbm_generated/storage_specification/storage_specification_generation_script.py @@ -0,0 +1,32 @@ +import sympy as sp + +from pystencils import Target + +from lbmpy.creationfunctions import create_lb_method +from lbmpy import LBMConfig, Stencil, Method, LBStencil +from pystencils_walberla import ManualCodeGenerationContext, generate_info_header +from lbmpy_walberla.storage_specification import generate_lbm_storage_specification + + +with ManualCodeGenerationContext(openmp=False, optimize_for_localhost=False, + mpi=True, double_accuracy=True, cuda=False) as ctx: + + for stencil in 
[LBStencil(Stencil.D3Q19), LBStencil(Stencil.D3Q27)]: + target = Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + method = Method.SRT + relaxation_rate = sp.symbols("omega") + streaming_pattern = 'pull' + nonuniform = False + + lbm_config = LBMConfig(stencil=stencil, method=method, relaxation_rate=relaxation_rate, + streaming_pattern=streaming_pattern) + + lb_method = create_lb_method(lbm_config=lbm_config) + + storage_spec_name = f'{stencil.name}StorageSpecification' + generate_lbm_storage_specification(ctx, storage_spec_name, lb_method, lbm_config, + nonuniform=nonuniform, target=target, data_type=data_type) + + ctx.write_all_files() diff --git a/src/lbm_generated/sweep_collection/CMakeLists.txt b/src/lbm_generated/sweep_collection/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..91fbfb9d64fa55c4f870875be3c58a3f65a06c98 --- /dev/null +++ b/src/lbm_generated/sweep_collection/CMakeLists.txt @@ -0,0 +1,7 @@ +target_sources( lbm_generated + PRIVATE + D3Q19SRT.h + D3Q19SRT.cpp + D3Q27SRT.h + D3Q27SRT.cpp + ) \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q19SRT.cpp b/src/lbm_generated/sweep_collection/D3Q19SRT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2ed08360d0699e51e3e47e1906727ef739a2e17 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q19SRT.cpp @@ -0,0 +1,1012 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19SRT.cpp +//! \\author pystencils +//====================================================================================================================== +#include "D3Q19SRT.h" + +#define FUNC_PREFIX + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning push +#pragma warning( disable : 1599 ) +#endif + +using namespace std; + +namespace walberla { +namespace lbm { + + +namespace internal_d3q19srt_kernel_streamCollide { +static FUNC_PREFIX void d3q19srt_kernel_streamCollide(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 
18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = 
_data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT 
_data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = 
_stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + 
_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double u_0 = vel0Term - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double u_1 = vel1Term - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u_2 = vel2Term - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.0*(u_0*u_0) - 1.0*(u_1*u_1) - 1.0*(u_2*u_2); + 
_data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.33333333333333331 - 1.0*_data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*0.16666666666666666 - 1.0*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*-0.16666666666666666 - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*-0.16666666666666666 - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*0.16666666666666666 - 1.0*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*0.16666666666666666 - 1.0*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*-0.16666666666666666 - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = 
omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*-0.083333333333333329 - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*0.083333333333333329 - 1.0*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*-0.083333333333333329 - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*0.083333333333333329 - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*0.083333333333333329 - 1.0*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*-0.083333333333333329 - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = 
omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu2*-0.083333333333333329 - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*0.083333333333333329 - 1.0*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*0.083333333333333329 - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*-0.083333333333333329 - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*-0.083333333333333329 - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu2*0.083333333333333329 - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + } + } + } +} +} + + +namespace 
internal_d3q19srt_kernel_collide { +static FUNC_PREFIX void d3q19srt_kernel_collide(double * RESTRICT _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT 
_data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double xi_1 = _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; + const double xi_2 = 
_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; + const double xi_3 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double xi_4 = _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; + const double xi_5 = _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double xi_6 = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; + const double xi_7 = _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; + const double xi_8 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; + const double xi_9 = _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double xi_10 = _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + const double xi_11 = _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; + const double xi_12 = _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_13 = _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double xi_14 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; + const double xi_15 = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; + const double xi_16 = _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + const double xi_17 = _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + const double xi_18 = _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + const double xi_19 = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; + const double vel0Term = xi_15 + xi_17 + xi_2 + xi_8 + xi_9; + const double vel1Term = xi_1 + xi_4 + xi_5 + xi_6; + const double vel2Term = xi_11 + xi_13 + xi_19; + const double delta_rho = vel0Term + vel1Term + vel2Term + xi_10 + xi_12 + xi_14 + xi_16 + xi_18 + xi_3 + xi_7; + const double u_0 = vel0Term + xi_11*-1.0 + xi_12*-1.0 + xi_14*-1.0 + xi_16*-1.0 + xi_5*-1.0; + const double u_1 = vel1Term + xi_12*-1.0 + xi_15*-1.0 + xi_18*-1.0 + xi_19*-1.0 + xi_7*-1.0 + xi_9; + const double u_2 = vel2Term + xi_1*-1.0 + xi_10*-1.0 + xi_16*-1.0 + xi_17 + xi_18*-1.0 + xi_2*-1.0 + xi_6; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = 
delta_rho - 1.0*(u_0*u_0) - 1.0*(u_1*u_1) - 1.0*(u_2*u_2); + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.33333333333333331 + xi_3*-1.0) + xi_3; + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*0.16666666666666666 + xi_4*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_4; + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_1*-0.16666666666666666 + xi_7*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_7; + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*-0.16666666666666666 + xi_14*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_14; + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_0*0.16666666666666666 + xi_8*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_8; + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*0.16666666666666666 + xi_13*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_13; + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.1111111111111111 + f_eq_common*0.16666666666666666 + u_2*-0.16666666666666666 + xi_10*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_10; + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*-0.083333333333333329 + xi_5*-1.0 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + xi_5; + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*0.083333333333333329 + xi_9*-1.0 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Pu1*u0Pu1)) + xi_9; + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu1*-0.083333333333333329 + xi_12*-1.0 + 0.041666666666666664*(u_2*u_2) + 
0.125*(u0Pu1*u0Pu1)) + xi_12; + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu1*0.083333333333333329 + xi_15*-1.0 + 0.041666666666666664*(u_2*u_2) + 0.125*(u0Mu1*u0Mu1)) + xi_15; + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*0.083333333333333329 + xi_6*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + xi_6; + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*-0.083333333333333329 + xi_19*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + xi_19; + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Mu2*-0.083333333333333329 + xi_11*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + xi_11; + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*0.083333333333333329 + xi_17*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + xi_17; + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Mu2*0.083333333333333329 + xi_1*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Mu2*u1Mu2)) + xi_1; + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u1Pu2*-0.083333333333333329 + xi_18*-1.0 + 0.041666666666666664*(u_0*u_0) + 0.125*(u1Pu2*u1Pu2)) + xi_18; + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + u0Pu2*-0.083333333333333329 + xi_16*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Pu2*u0Pu2)) + xi_16; + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.041666666666666664 + 
u0Mu2*0.083333333333333329 + xi_2*-1.0 + 0.041666666666666664*(u_1*u_1) + 0.125*(u0Mu2*u0Mu2)) + xi_2; + } + } + } +} +} + + +namespace internal_d3q19srt_kernel_stream { +static FUNC_PREFIX void d3q19srt_kernel_stream(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 
= _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + 
_stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + 
_data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + 
_data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = 
_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = streamed_0; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18; + } + } + } +} +} + + +namespace internal_d3q19srt_kernel_streamOnlyNoAdvancement { +static FUNC_PREFIX void d3q19srt_kernel_streamOnlyNoAdvancement(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = 
_data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * 
RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 
< _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + 
_data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = 
_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = streamed_0; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9; + 
// Tail of the preceding stream kernel: write the streamed PDF values of
// directions 10..18 for the current cell into the temporary (destination) field.
            _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10;
            _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11;
            _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12;
            _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13;
            _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14;
            _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15;
            _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16;
            _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17;
            _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18;
         }
      }
   }
}
}


namespace internal_d3q19srt_kernel_initialise {
// Generated initialisation kernel: fills all 19 PDF components of the pdfs
// field from the given density and velocity fields.  For each cell it reads
// rho and (u_0, u_1, u_2), forms delta_rho = rho - 1.0 (PDFs are stored
// relative to a unit background density, cf. "rho = delta_rho + 1.0" in the
// getter kernel below), and writes one second-order polynomial in the
// velocity per lattice direction.  The constant prefactors 1/3, 1/18 and
// 1/36 are emitted by the code generator (presumably the D3Q19 lattice
// weights — confirm against the lbmpy model if this matters).
// _size_density_* give the iteration extents; _stride_*_{0,1,2,3} are the
// x/y/z/f strides of the respective fields.
static FUNC_PREFIX void d3q19srt_kernel_initialise(double * RESTRICT const _data_density, double * RESTRICT _data_pdfs, double * RESTRICT const _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3)
{
   for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1)
   {
      // Base pointers of the current z-slice, one per field component
      // (velocity has 3 f-components, pdfs has 19 — one per direction).
      double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2;
      double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2;
      double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3;
      double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3;
      double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2;
      double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3;
      for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1)
      {
         // Row pointers of the current y-row within the z-slice.
         double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30;
         double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30;
         double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31;
         double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32;
         double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30;
         double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31;
         double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32;
         double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33;
         double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34;
         double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35;
         double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36;
         double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37;
         double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38;
         double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39;
         double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310;
         double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311;
         double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312;
         double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313;
         double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314;
         double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315;
         double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316;
         double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317;
         double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318;
         for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1)
         {
            const double rho = _data_density_20_30_10[_stride_density_0*ctr_0];
            // PDFs are stored relative to a background density of 1.0.
            const double delta_rho = rho - 1.0;
            const double u_0 = _data_velocity_20_30_10[_stride_velocity_0*ctr_0];
            const double u_1 = _data_velocity_20_31_10[_stride_velocity_0*ctr_0];
            const double u_2 = _data_velocity_20_32_10[_stride_velocity_0*ctr_0];
            // One generated polynomial per direction: center (prefactor 1/3),
            // six axis-aligned directions (1/18), twelve diagonals (1/36).
            _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = delta_rho*0.33333333333333331 - 0.33333333333333331*(u_0*u_0) - 0.33333333333333331*(u_1*u_1) - 0.33333333333333331*(u_2*u_2);
            _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_1*0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_1*u_1);
            _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_1*-0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_1*u_1);
            _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_0*-0.16666666666666666 - 0.16666666666666666*(u_1*u_1) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_0*u_0);
            _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_0*0.16666666666666666 - 0.16666666666666666*(u_1*u_1) - 0.16666666666666666*(u_2*u_2) + 0.16666666666666666*(u_0*u_0);
            _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_2*0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_1*u_1) + 0.16666666666666666*(u_2*u_2);
            _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = delta_rho*0.055555555555555552 + u_2*-0.16666666666666666 - 0.16666666666666666*(u_0*u_0) - 0.16666666666666666*(u_1*u_1) + 0.16666666666666666*(u_2*u_2);
            _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*-0.25 + u_0*-0.083333333333333329 + u_1*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1);
            _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*0.25 + u_0*0.083333333333333329 + u_1*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1);
            _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*0.25 + u_0*-0.083333333333333329 + u_1*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1);
            _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_1*-0.25 + u_0*0.083333333333333329 + u_1*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_1*u_1);
            _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*0.25 + u_1*0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*-0.25 + u_1*-0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*-0.25 + u_0*-0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*0.25 + u_0*0.083333333333333329 + u_2*0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*-0.25 + u_1*0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_1*u_2*0.25 + u_1*-0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_1*u_1) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*0.25 + u_0*-0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2);
            _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = delta_rho*0.027777777777777776 + u_0*u_2*-0.25 + u_0*0.083333333333333329 + u_2*-0.083333333333333329 + 0.083333333333333329*(u_0*u_0) + 0.083333333333333329*(u_2*u_2);
         }
      }
   }
}
}
namespace internal_d3q19srt_kernel_getter {
// Generated "getter" kernel: reconstructs the macroscopic quantities from the
// PDF field.  delta_rho is the sum of all 19 PDFs, rho = delta_rho + 1.0
// (unit background density), and the momentum density components are signed
// sums over the directions; they are written directly as velocity u_0..u_2.
// Writes density (1 component) and velocity (3 components); pdfs is read-only.
static FUNC_PREFIX void d3q19srt_kernel_getter(double * RESTRICT _data_density, double * RESTRICT const _data_pdfs, double * RESTRICT _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3)
{
   for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1)
   {
      // z-slice base pointers; the ordering follows first use in the
      // generated expressions below, not the direction index.
      double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2;
      double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3;
      double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2;
      double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2;
      double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3;
      double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3;
      for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1)
      {
         // Row pointers of the current y-row.
         double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310;
         double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314;
         double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318;
         double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34;
         double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38;
         double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313;
         double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317;
         double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33;
         double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37;
         double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39;
         double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31;
         double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311;
         double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315;
         double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312;
         double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316;
         double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32;
         double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35;
         double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30;
         double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36;
         double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30;
         double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30;
         double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31;
         double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32;
         for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1)
         {
            // Partial sums over directions with positive x/y/z component;
            // momentum densities subtract the opposite directions.
            const double vel0Term = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0];
            const double momdensity_0 = vel0Term - 1.0*_data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0];
            const double vel1Term = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0];
            const double momdensity_1 = vel1Term - 1.0*_data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0];
            const double vel2Term = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0];
            // delta_rho sums all 19 PDFs (the vel*Terms plus the remaining directions).
            const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0];
            const double momdensity_2 = vel2Term - 1.0*_data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0];
            // Unit background density; velocity is the momentum density here
            // (no division by rho in this generated model).
            const double rho = delta_rho + 1.0;
            const double u_0 = momdensity_0;
            const double u_1 = momdensity_1;
            const double u_2 = momdensity_2;
            _data_density_20_30_10[_stride_density_0*ctr_0] = rho;
            _data_velocity_20_30_10[_stride_velocity_0*ctr_0] = u_0;
            _data_velocity_20_31_10[_stride_velocity_0*ctr_0] = u_1;
            _data_velocity_20_32_10[_stride_velocity_0*ctr_0] = u_2;
         }
      }
   }
}
}




// Public wrapper: runs the fused stream-collide kernel on the full field
// (interior plus `ghost_layers` ghost layers on every side).  Extracts raw
// pointers, sizes and strides from the fields and delegates to the generated
// kernel.  pdfs is the source, pdfs_tmp the destination; omega is the SRT
// relaxation rate.
void D3Q19SRT::streamCollide( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const cell_idx_t ghost_layers )
{
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers))
   const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
   const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
   const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
   const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
   internal_d3q19srt_kernel_streamCollide::d3q19srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega);
}
// Same as streamCollide, but restricted to the given cell interval `ci`.
void D3Q19SRT::streamCollideCellInterval( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const CellInterval & ci)
{
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
// (continuation of streamCollideCellInterval: remaining bounds checks,
// interval sizes, strides and delegation to the generated kernel)
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
   const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
   const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
   const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
   const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
   const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
   const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
   internal_d3q19srt_kernel_streamCollide::d3q19srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega);
}

// Public wrapper: in-place collide step over the full field (interior plus
// `ghost_layers` ghost layers); omega is the SRT relaxation rate.
void D3Q19SRT::collide( field::GhostLayerField<double, 19> * pdfs, double omega, const cell_idx_t ghost_layers )
{
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers))
   const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   internal_d3q19srt_kernel_collide::d3q19srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega);
}
// Same as collide, but restricted to the given cell interval `ci`.
void D3Q19SRT::collideCellInterval( field::GhostLayerField<double, 19> * pdfs, double omega, const CellInterval & ci)
{
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
   const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
   const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
   const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
// (continuation of collideCellInterval: delegate to the generated kernel)
   internal_d3q19srt_kernel_collide::d3q19srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega);
}

// Public wrapper: pure stream step over the full field (interior plus
// `ghost_layers` ghost layers); reads from pdfs, writes to pdfs_tmp.
void D3Q19SRT::stream( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers )
{
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers))
   const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
   const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
   const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
   const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
   internal_d3q19srt_kernel_stream::d3q19srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3);
}
// Same as stream, but restricted to the given cell interval `ci`.
void D3Q19SRT::streamCellInterval( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci)
{
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
   const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
   const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
   const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
   const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
   const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
   const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
   internal_d3q19srt_kernel_stream::d3q19srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3);
}

// Public wrapper: stream step without advancing/swapping the fields
// (pdfs stays the source, pdfs_tmp receives the streamed values).
void D3Q19SRT::streamOnlyNoAdvancement( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers )
{
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers))
   const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers))
   const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
   const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
   const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
   const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
   internal_d3q19srt_kernel_streamOnlyNoAdvancement::d3q19srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3);
}
// Same as streamOnlyNoAdvancement, but restricted to the given cell interval `ci`.
void D3Q19SRT::streamOnlyNoAdvancementCellInterval( field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci)
{
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
   const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
   const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
   const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0);
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
   const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
   const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
   const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
// (continuation of streamOnlyNoAdvancementCellInterval: delegate to the generated kernel)
   internal_d3q19srt_kernel_streamOnlyNoAdvancement::d3q19srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3);
}

// Public wrapper: initialise the 19 PDF components from the density and
// velocity fields over the full field (interior plus `ghost_layers` ghost
// layers).  density and velocity are read-only; pdfs is written.
void D3Q19SRT::initialise( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers )
{
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers()))
   double * RESTRICT const _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers()))
   double * RESTRICT const _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
   WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers))
   const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers))
   const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 2*ghost_layers);
   WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers))
   const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers);
   const int64_t _stride_density_0 = int64_t(density->xStride());
   const int64_t _stride_density_1 = int64_t(density->yStride());
   const int64_t _stride_density_2 = int64_t(density->zStride());
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
   const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
   const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
   const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
   internal_d3q19srt_kernel_initialise::d3q19srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
}
// Same as initialise, but restricted to the given cell interval `ci`.
void D3Q19SRT::initialiseCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci)
{
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers()))
   double * RESTRICT const _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers()))
   double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers()))
   WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers()))
   double * RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0);
   WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0))
   const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0))
   const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0);
   WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0))
   const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0);
   const int64_t _stride_density_0 = int64_t(density->xStride());
   const int64_t _stride_density_1 = int64_t(density->yStride());
   const int64_t _stride_density_2 = int64_t(density->zStride());
   const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride());
   const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride());
   const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride());
   const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
   const int64_t _stride_velocity_0 = int64_t(velocity->xStride());
   const int64_t _stride_velocity_1 = int64_t(velocity->yStride());
   const int64_t _stride_velocity_2 = int64_t(velocity->zStride());
   const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride()));
   internal_d3q19srt_kernel_initialise::d3q19srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3);
}

// Public wrapper: recompute density and velocity from the PDF field over the
// full field (interior plus `ghost_layers` ghost layers); pdfs is read-only,
// density and velocity are written.
void D3Q19SRT::calculateMacroscopicParameters( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers )
{
   WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers()))
   double * RESTRICT _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0);
WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q19srt_kernel_getter::d3q19srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, 
_stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q19SRT::calculateMacroscopicParametersCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = 
int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q19srt_kernel_getter::d3q19srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q19SRT.h b/src/lbm_generated/sweep_collection/D3Q19SRT.h new file mode 100644 index 0000000000000000000000000000000000000000..2fdb3850cb000daf544b265fa4ae3808253ddc00 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q19SRT.h @@ -0,0 +1,1131 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q19SRT.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" +#include "core/Macros.h" + + + +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/SwapableCompare.h" +#include "field/GhostLayerField.h" + +#include <set> +#include <cmath> + + + +using namespace std::placeholders; + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wreorder" +#endif + +namespace walberla { +namespace lbm { + + +class D3Q19SRT +{ +public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + D3Q19SRT(const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID pdfsID_, BlockDataID densityID_, BlockDataID velocityID_, double omega, const Cell & outerWidth=Cell(1, 1, 1)) + : blocks_(blocks), pdfsID(pdfsID_), densityID(densityID_), velocityID(velocityID_), omega_(omega), outerWidth_(outerWidth) + { + + + for (auto& iBlock : *blocks) + { + if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || + int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || + 
int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2) + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + }; + + + ~D3Q19SRT() { + for(auto p: cache_pdfs_) { + delete p; + } + } + + + /************************************************************************************* + * Internal Function Definitions with raw Pointer + *************************************************************************************/ + static void streamCollide (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const cell_idx_t ghost_layers = 0); + static void streamCollideCellInterval (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, double omega, const CellInterval & ci); + + static void collide (field::GhostLayerField<double, 19> * pdfs, double omega, const cell_idx_t ghost_layers = 0); + static void collideCellInterval (field::GhostLayerField<double, 19> * pdfs, double omega, const CellInterval & ci); + + static void stream (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamCellInterval (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci); + + static void streamOnlyNoAdvancement (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamOnlyNoAdvancementCellInterval (field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 19> * pdfs_tmp, const CellInterval & ci); + + static void initialise (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void initialiseCellInterval (field::GhostLayerField<double, 1> * density, 
field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + static void calculateMacroscopicParameters (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void calculateMacroscopicParametersCellInterval (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 19> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + + /************************************************************************************* + * Function Definitions for external Usage + *************************************************************************************/ + + std::function<void (IBlock *)> streamCollide() + { + return [this](IBlock* block) { streamCollide(block); }; + } + + std::function<void (IBlock *)> streamCollide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this](IBlock* block) { streamCollide(block); }; + } + } + + std::function<void (IBlock *)> streamCollide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamCollide(block, ghost_layers); }; + } + } + + + + void streamCollide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = 
pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCollideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, inner); + } + + void streamCollideOuter(IBlock * block) + 
{ + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> collide() + { + return [this](IBlock* block) { collide(block); }; + } + + std::function<void (IBlock *)> collide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { collideOuter(block); }; + default: + return [this](IBlock* block) { collide(block); }; + } + } + + std::function<void (IBlock *)> collide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); }; + case Type::OUTER: + 
return [this](IBlock* block) { collideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { collide(block, ghost_layers); }; + } + } + + + + void collide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + void collide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + + + void collideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + collideCellInterval(pdfs, omega, ci); + + } + + void collideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + collideCellInterval(pdfs, omega, inner); + } + + void collideOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + 
layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + collideCellInterval(pdfs, omega, ci); + } + + + + } + + + std::function<void (IBlock *)> stream() + { + return [this](IBlock* block) { stream(block); }; + } + + std::function<void (IBlock *)> stream(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this](IBlock* block) { stream(block); }; + } + } + + std::function<void (IBlock *)> stream(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { stream(block, ghost_layers); }; + } + } + + + + void stream(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void stream(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + 
stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamCellInterval(pdfs, pdfs_tmp, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + 
pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCellInterval(pdfs, pdfs_tmp, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> streamOnlyNoAdvancement() + { + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamOnlyNoAdvancement(block, ghost_layers); }; + } + } + + + + void streamOnlyNoAdvancement(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + 
streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + void streamOnlyNoAdvancement(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + + + void streamOnlyNoAdvancementCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + + } + + void streamOnlyNoAdvancementInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOnlyNoAdvancementOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + field::GhostLayerField<double, 19> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + 
pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + } + + + + } + + + std::function<void (IBlock *)> initialise() + { + return [this](IBlock* block) { initialise(block); }; + } + + std::function<void (IBlock *)> initialise(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this](IBlock* block) { initialise(block); }; + } + } + + std::function<void (IBlock *)> initialise(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { initialise(block, ghost_layers); }; + } + } + + + + void initialise(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< 
field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + void initialise(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + + + void initialiseCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialiseCellInterval(density, pdfs, velocity, ci); + + } + + void initialiseInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + initialiseCellInterval(density, pdfs, velocity, inner); + } + + void initialiseOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + 
density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + initialiseCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + std::function<void (IBlock *)> calculateMacroscopicParameters() + { + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { calculateMacroscopicParametersOuter(block); }; + default: + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { calculateMacroscopicParametersOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { calculateMacroscopicParameters(block, ghost_layers); }; + } + } + + + + void calculateMacroscopicParameters(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + 
auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + void calculateMacroscopicParameters(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + + + void calculateMacroscopicParametersCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + + } + + void calculateMacroscopicParametersInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, inner); + } + + void calculateMacroscopicParametersOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 19> >(pdfsID); + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + 
density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + + + private: + shared_ptr< StructuredBlockStorage > blocks_; + BlockDataID pdfsID; + BlockDataID densityID; + BlockDataID velocityID; + double omega_; + + private: std::set< field::GhostLayerField<double, 19> *, field::SwapableCompare< field::GhostLayerField<double, 19> * > > cache_pdfs_; + + Cell outerWidth_; + std::vector<CellInterval> layers_; + + +}; + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q27SRT.cpp b/src/lbm_generated/sweep_collection/D3Q27SRT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce89749fc60ab603f3172992cb46c65242e57d16 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q27SRT.cpp @@ -0,0 +1,1220 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27SRT.cpp +//! \\author pystencils +//====================================================================================================================== +#include "D3Q27SRT.h" + +#define FUNC_PREFIX + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning push +#pragma warning( disable : 1599 ) +#endif + +using namespace std; + +namespace walberla { +namespace lbm { + + +namespace internal_d3q27srt_kernel_streamCollide { +static FUNC_PREFIX void d3q27srt_kernel_streamCollide(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_2m1_321 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_319 
= _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_320 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_21_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_322 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_33 
= _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT 
_data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_319 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 19*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_320 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 20*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_321 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 21*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_322 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 22*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_323 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 23*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_324 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 24*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_325 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 25*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_326 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 26*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_2m1_321_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_321; + double * RESTRICT _data_pdfs_2m1_319_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_319; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + 
_data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_21_325_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_325; + double * RESTRICT _data_pdfs_21_323_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_323; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_320_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_320; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_21_324_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_324; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_322_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_322; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT 
_data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_326_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_326; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * 
RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * RESTRICT _data_pdfs_tmp_20_319_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_319; + double * RESTRICT _data_pdfs_tmp_20_320_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_320; + double * RESTRICT _data_pdfs_tmp_20_321_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_321; + double * RESTRICT _data_pdfs_tmp_20_322_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_322; + double * RESTRICT _data_pdfs_tmp_20_323_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_323; + double * RESTRICT _data_pdfs_tmp_20_324_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_324; + double * RESTRICT _data_pdfs_tmp_20_325_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_325; + double * RESTRICT _data_pdfs_tmp_20_326_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_326; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double vel0Term = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double vel1Term = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double vel2Term = 
_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double u_0 = vel0Term - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double u_1 = vel1Term - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 
_data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u_2 = vel2Term - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double u0Mu1 = u_0 + u_1*-1.0; + const double u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.5*(u_0*u_0) - 1.5*(u_1*u_1) - 1.5*(u_2*u_2); + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.29629629629629628 - 1.0*_data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]) + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*0.22222222222222221 - 1.0*_data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*-0.22222222222222221 - 1.0*_data_pdfs_20_32_11[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_1*u_1)) + _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + 
_data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*-0.22222222222222221 - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*0.22222222222222221 - 1.0*_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.33333333333333331*(u_0*u_0)) + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*0.22222222222222221 - 1.0*_data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*-0.22222222222222221 - 1.0*_data_pdfs_21_36_10[_stride_pdfs_0*ctr_0] + 0.33333333333333331*(u_2*u_2)) + _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu1*-0.055555555555555552 - 1.0*_data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Mu1*u0Mu1)) + _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*0.055555555555555552 - 1.0*_data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Pu1*u0Pu1)) + _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*-0.055555555555555552 - 1.0*_data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Pu1*u0Pu1)) + _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = 
omega*(f_eq_common*0.018518518518518517 + u0Mu1*0.055555555555555552 - 1.0*_data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Mu1*u0Mu1)) + _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*0.055555555555555552 - 1.0*_data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*-0.055555555555555552 - 1.0*_data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*-0.055555555555555552 - 1.0*_data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Mu2*u0Mu2)) + _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*0.055555555555555552 - 1.0*_data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Pu2*u0Pu2)) + _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*0.055555555555555552 - 1.0*_data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Mu2*u1Mu2)) + _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*-0.055555555555555552 - 1.0*_data_pdfs_21_316_11[_stride_pdfs_0*ctr_0] + 0.083333333333333329*(u1Pu2*u1Pu2)) + _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*-0.055555555555555552 - 
1.0*_data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.083333333333333329*(u0Pu2*u0Pu2)) + _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*0.055555555555555552 - 1.0*_data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.083333333333333329*(u0Mu2*u0Mu2)) + _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_319_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_320_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_321_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_322_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888 - 1.0*_data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 
0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_323_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_324_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_325_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0] + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + _data_pdfs_tmp_20_326_10[_stride_pdfs_tmp_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888 - 1.0*_data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0] + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + } + } + } +} +} + + +namespace internal_d3q27srt_kernel_collide { +static FUNC_PREFIX void 
d3q27srt_kernel_collide(double * RESTRICT _data_pdfs, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, double omega) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_321 = _data_pdfs + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_319 = _data_pdfs + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_320 = _data_pdfs + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + 
_stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_322 = _data_pdfs + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_pdfs_20_326_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_326; + double * RESTRICT _data_pdfs_20_323_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_323; + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * RESTRICT _data_pdfs_20_321_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_321; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_324_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_324; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_319_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_319; + double * RESTRICT _data_pdfs_20_31_10 = 
_stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_325_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_325; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_320_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_320; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_322_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_322; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double xi_1 = _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0]; + const double xi_2 = _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0]; + const double xi_3 = _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0]; + const double xi_4 = _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0]; + const double xi_5 = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0]; + const double xi_6 = _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0]; + const double xi_7 = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0]; + const double xi_8 = _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0]; + const double xi_9 = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0]; + const double xi_10 = _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0]; + const double xi_11 = 
_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double xi_12 = _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0]; + const double xi_13 = _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0]; + const double xi_14 = _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double xi_15 = _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0]; + const double xi_16 = _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0]; + const double xi_17 = _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0]; + const double xi_18 = _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double xi_19 = _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double xi_20 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double xi_21 = _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0]; + const double xi_22 = _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0]; + const double xi_23 = _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0]; + const double xi_24 = _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0]; + const double xi_25 = _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0]; + const double xi_26 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0]; + const double xi_27 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0]; + const double vel0Term = xi_12 + xi_14 + xi_15 + xi_24 + xi_25 + xi_26 + xi_4 + xi_5 + xi_8; + const double vel1Term = xi_10 + xi_11 + xi_13 + xi_17 + xi_21 + xi_9; + const double vel2Term = xi_1 + xi_19 + xi_22 + xi_7; + const double delta_rho = vel0Term + vel1Term + vel2Term + xi_16 + xi_18 + xi_2 + xi_20 + xi_23 + xi_27 + xi_3 + xi_6; + const double u_0 = vel0Term + xi_1*-1.0 + xi_10*-1.0 + xi_11*-1.0 + xi_17*-1.0 + xi_18*-1.0 + xi_22*-1.0 + xi_23*-1.0 + xi_27*-1.0 + xi_3*-1.0; + const double u_1 = vel1Term + xi_12 + xi_14 + xi_15*-1.0 + xi_16*-1.0 + xi_18*-1.0 + xi_22*-1.0 + xi_3*-1.0 + xi_4 + xi_5*-1.0 + xi_6*-1.0 + xi_7*-1.0 + xi_8*-1.0; + const double u_2 = vel2Term + xi_10*-1.0 + xi_12 + xi_15*-1.0 + xi_17 + xi_2*-1.0 + xi_21*-1.0 + xi_23*-1.0 + xi_24 + xi_25*-1.0 + xi_3*-1.0 + xi_4*-1.0 + xi_6*-1.0 + xi_8 + xi_9; + const double u0Mu1 = u_0 + u_1*-1.0; + const double 
u0Pu1 = u_0 + u_1; + const double u1Pu2 = u_1 + u_2; + const double u1Mu2 = u_1 + u_2*-1.0; + const double u0Mu2 = u_0 + u_2*-1.0; + const double u0Pu2 = u_0 + u_2; + const double f_eq_common = delta_rho - 1.5*(u_0*u_0) - 1.5*(u_1*u_1) - 1.5*(u_2*u_2); + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.29629629629629628 + xi_20*-1.0) + xi_20; + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*0.22222222222222221 + xi_13*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_13; + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_1*-0.22222222222222221 + xi_16*-1.0 + 0.33333333333333331*(u_1*u_1)) + xi_16; + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*-0.22222222222222221 + xi_27*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_27; + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_0*0.22222222222222221 + xi_26*-1.0 + 0.33333333333333331*(u_0*u_0)) + xi_26; + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*0.22222222222222221 + xi_19*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_19; + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.07407407407407407 + u_2*-0.22222222222222221 + xi_2*-1.0 + 0.33333333333333331*(u_2*u_2)) + xi_2; + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu1*-0.055555555555555552 + xi_11*-1.0 + 0.083333333333333329*(u0Mu1*u0Mu1)) + xi_11; + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*0.055555555555555552 + xi_14*-1.0 + 0.083333333333333329*(u0Pu1*u0Pu1)) + xi_14; + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu1*-0.055555555555555552 + xi_18*-1.0 + 0.083333333333333329*(u0Pu1*u0Pu1)) + xi_18; + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + 
u0Mu1*0.055555555555555552 + xi_5*-1.0 + 0.083333333333333329*(u0Mu1*u0Mu1)) + xi_5; + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*0.055555555555555552 + xi_9*-1.0 + 0.083333333333333329*(u1Pu2*u1Pu2)) + xi_9; + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*-0.055555555555555552 + xi_7*-1.0 + 0.083333333333333329*(u1Mu2*u1Mu2)) + xi_7; + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*-0.055555555555555552 + xi_1*-1.0 + 0.083333333333333329*(u0Mu2*u0Mu2)) + xi_1; + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*0.055555555555555552 + xi_24*-1.0 + 0.083333333333333329*(u0Pu2*u0Pu2)) + xi_24; + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Mu2*0.055555555555555552 + xi_21*-1.0 + 0.083333333333333329*(u1Mu2*u1Mu2)) + xi_21; + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u1Pu2*-0.055555555555555552 + xi_6*-1.0 + 0.083333333333333329*(u1Pu2*u1Pu2)) + xi_6; + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Pu2*-0.055555555555555552 + xi_23*-1.0 + 0.083333333333333329*(u0Pu2*u0Pu2)) + xi_23; + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = omega*(f_eq_common*0.018518518518518517 + u0Mu2*0.055555555555555552 + xi_25*-1.0 + 0.083333333333333329*(u0Mu2*u0Mu2)) + xi_25; + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*0.013888888888888888 + xi_12*-1.0 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_12; + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888 + xi_17*-1.0 + 
0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_17; + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*0.013888888888888888 + xi_8*-1.0 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_8; + _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888 + xi_22*-1.0 + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_22; + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888 + xi_4*-1.0 + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_4; + _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888 + xi_10*-1.0 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2)) + xi_10; + _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888 + xi_15*-1.0 + 0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_15; + _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] = omega*(delta_rho*-0.013888888888888888 + f_eq_common*0.018518518518518517 + u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888 + xi_3*-1.0 + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2)) + xi_3; + } + } + } 
}
}


namespace internal_d3q27srt_kernel_stream {
// Pull-style streaming step for a D3Q27 lattice (appears to be
// pystencils/lbmpy-generated code -- do not hand-edit the offsets).
// For every interior cell, each of the 27 PDF components is read from the
// neighbouring cell opposite to its lattice velocity and written, unchanged,
// to the same component of the temporary field _data_pdfs_tmp.
//
// Naming convention visible in the pointer setup below:
//   _20 / _2m1 / _21   : current z-slice / z-1 slice (-_stride_pdfs_2) /
//                        z+1 slice (+_stride_pdfs_2)
//   _10 / _1m1 / _11   : current row / y-1 row (-_stride_pdfs_1) /
//                        y+1 row (+_stride_pdfs_1)
//   index +/- _stride_pdfs_0 in the innermost loop : x+1 / x-1 neighbour
//   _3k                : PDF component k, selected via k*_stride_pdfs_3
//
// _size_pdfs_* are the loop extents; _stride_pdfs_* / _stride_pdfs_tmp_* are
// the (element) strides of the source and destination fields.  FUNC_PREFIX
// and RESTRICT are macros, presumably defined earlier in this generated file.
// NOTE(review): the reads reach one cell beyond the loop extents in every
// direction, so the caller must provide at least one ghost layer -- TODO
// confirm against the sweep wrapper.
static FUNC_PREFIX void d3q27srt_kernel_stream(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3)
{
   for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1)
   {
      // Per-slice base pointers for all 27 source components; the z-offset
      // (none, -_stride_pdfs_2, +_stride_pdfs_2) already accounts for the
      // z-part of each direction's pull offset.
      double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2;
      double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 14*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_319 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 19*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_320 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 20*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_321 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 21*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_2m1_322 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 22*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 23*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 24*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 25*_stride_pdfs_3;
      double * RESTRICT _data_pdfs_21_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 26*_stride_pdfs_3;
      // Destination pointers: writes always target the local cell, so only
      // the plain z-slice offset and the component offset are applied.
      double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2;
      double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_319 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 19*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_320 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 20*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_321 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 21*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_322 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 22*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_323 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 23*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_324 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 24*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_325 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 25*_stride_pdfs_tmp_3;
      double * RESTRICT _data_pdfs_tmp_20_326 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 26*_stride_pdfs_tmp_3;
      for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1)
      {
         // Per-row source pointers: the y-part of each direction's pull
         // offset is folded in here (_1m1 = y-1, _11 = y+1, _10 = same row).
         double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30;
         double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31;
         double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32;
         double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33;
         double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34;
         double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35;
         double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36;
         double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37;
         double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38;
         double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39;
         double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310;
         double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311;
         double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312;
         double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313;
         double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314;
         double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315;
         double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316;
         double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317;
         double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318;
         double * RESTRICT _data_pdfs_2m1_319_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_319;
         double * RESTRICT _data_pdfs_2m1_320_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_320;
         double * RESTRICT _data_pdfs_2m1_321_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_321;
         double * RESTRICT _data_pdfs_2m1_322_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_322;
         double * RESTRICT _data_pdfs_21_323_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_323;
         double * RESTRICT _data_pdfs_21_324_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_324;
         double * RESTRICT _data_pdfs_21_325_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_325;
         double * RESTRICT _data_pdfs_21_326_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_326;
         // Per-row destination pointers (always the local row).
         double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30;
         double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31;
         double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32;
         double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33;
         double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34;
         double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35;
         double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36;
         double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37;
         double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38;
         double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39;
         double * RESTRICT _data_pdfs_tmp_20_310_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310;
         double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311;
         double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312;
         double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313;
         double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314;
         double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315;
         double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316;
         double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317;
         double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318;
         double * RESTRICT _data_pdfs_tmp_20_319_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_319;
         double * RESTRICT _data_pdfs_tmp_20_320_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_320;
         double * RESTRICT _data_pdfs_tmp_20_321_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_321;
         double * RESTRICT _data_pdfs_tmp_20_322_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_322;
         double * RESTRICT _data_pdfs_tmp_20_323_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_323;
         double * RESTRICT _data_pdfs_tmp_20_324_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_324;
         double * RESTRICT _data_pdfs_tmp_20_325_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_325;
         double * RESTRICT _data_pdfs_tmp_20_326_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_326;
         for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1)
         {
            // Gather: the x-part of the pull offset appears as the
            // +/- _stride_pdfs_0 term in the subscript (x+1 / x-1 neighbour).
            const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0];
            const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0];
            const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0];
            const double streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_4 = _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0];
            const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0];
            const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0];
            const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0];
            const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0];
            const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0];
            const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_19 = _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_20 = _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_21 = _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_22 = _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_23 = _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_24 = _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            const double streamed_25 = _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0];
            const double streamed_26 = _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0];
            // Scatter: every component is stored unmodified at the local cell
            // of the temporary field (no collision in this kernel).
            _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = streamed_0;
            _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1;
            _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2;
            _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3;
            _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4;
            _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5;
            _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6;
            _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7;
            _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8;
            _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9;
            _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10;
            _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11;
            _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12;
            _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13;
            _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14;
            _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15;
            _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16;
            _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17;
            _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18;
            _data_pdfs_tmp_20_319_10[_stride_pdfs_tmp_0*ctr_0] = streamed_19;
            _data_pdfs_tmp_20_320_10[_stride_pdfs_tmp_0*ctr_0] = streamed_20;
            _data_pdfs_tmp_20_321_10[_stride_pdfs_tmp_0*ctr_0] = streamed_21;
            _data_pdfs_tmp_20_322_10[_stride_pdfs_tmp_0*ctr_0] = streamed_22;
            _data_pdfs_tmp_20_323_10[_stride_pdfs_tmp_0*ctr_0] = streamed_23;
            _data_pdfs_tmp_20_324_10[_stride_pdfs_tmp_0*ctr_0] = streamed_24;
            _data_pdfs_tmp_20_325_10[_stride_pdfs_tmp_0*ctr_0] = streamed_25;
            _data_pdfs_tmp_20_326_10[_stride_pdfs_tmp_0*ctr_0] = streamed_26;
         }
      }
   }
}
}


namespace internal_d3q27srt_kernel_streamOnlyNoAdvancement {
static FUNC_PREFIX void
d3q27srt_kernel_streamOnlyNoAdvancement(double * RESTRICT const _data_pdfs, double * RESTRICT _data_pdfs_tmp, int64_t const _size_pdfs_0, int64_t const _size_pdfs_1, int64_t const _size_pdfs_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_pdfs_tmp_0, int64_t const _stride_pdfs_tmp_1, int64_t const _stride_pdfs_tmp_2, int64_t const _stride_pdfs_tmp_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_pdfs_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_35 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_311 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_312 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_313 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_314 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 
14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_319 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_320 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_321 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_2m1_322 = _data_pdfs + _stride_pdfs_2*ctr_2 - _stride_pdfs_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_21_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_tmp_20_30 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2; + double * RESTRICT _data_pdfs_tmp_20_31 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + _stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_32 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 2*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_33 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 3*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_34 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 4*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_35 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 5*_stride_pdfs_tmp_3; + 
double * RESTRICT _data_pdfs_tmp_20_36 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 6*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_37 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 7*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_38 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 8*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_39 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 9*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_310 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 10*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_311 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 11*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_312 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 12*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_313 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 13*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_314 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 14*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_315 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 15*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_316 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 16*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_317 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 17*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_318 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 18*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_319 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 19*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_320 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 20*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_321 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 21*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_322 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 22*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_323 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 23*_stride_pdfs_tmp_3; + double * 
RESTRICT _data_pdfs_tmp_20_324 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 24*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_325 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 25*_stride_pdfs_tmp_3; + double * RESTRICT _data_pdfs_tmp_20_326 = _data_pdfs_tmp + _stride_pdfs_tmp_2*ctr_2 + 26*_stride_pdfs_tmp_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_pdfs_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_31_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_2m1_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_35; + double * RESTRICT _data_pdfs_21_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_36; + double * RESTRICT _data_pdfs_20_37_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_2m1_311_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_311; + double * RESTRICT _data_pdfs_2m1_312_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_312; + double * RESTRICT _data_pdfs_2m1_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_313; + double * RESTRICT _data_pdfs_2m1_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_2m1_314; + double * RESTRICT _data_pdfs_21_315_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_315; + double * RESTRICT _data_pdfs_21_316_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_316; + 
double * RESTRICT _data_pdfs_21_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_317; + double * RESTRICT _data_pdfs_21_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_21_318; + double * RESTRICT _data_pdfs_2m1_319_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_319; + double * RESTRICT _data_pdfs_2m1_320_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_2m1_320; + double * RESTRICT _data_pdfs_2m1_321_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_321; + double * RESTRICT _data_pdfs_2m1_322_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_2m1_322; + double * RESTRICT _data_pdfs_21_323_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_323; + double * RESTRICT _data_pdfs_21_324_1m1 = _stride_pdfs_1*ctr_1 - _stride_pdfs_1 + _data_pdfs_21_324; + double * RESTRICT _data_pdfs_21_325_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_325; + double * RESTRICT _data_pdfs_21_326_11 = _stride_pdfs_1*ctr_1 + _stride_pdfs_1 + _data_pdfs_21_326; + double * RESTRICT _data_pdfs_tmp_20_30_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_30; + double * RESTRICT _data_pdfs_tmp_20_31_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_31; + double * RESTRICT _data_pdfs_tmp_20_32_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_32; + double * RESTRICT _data_pdfs_tmp_20_33_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_33; + double * RESTRICT _data_pdfs_tmp_20_34_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_34; + double * RESTRICT _data_pdfs_tmp_20_35_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_35; + double * RESTRICT _data_pdfs_tmp_20_36_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_36; + double * RESTRICT _data_pdfs_tmp_20_37_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_37; + double * RESTRICT _data_pdfs_tmp_20_38_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_38; + double * RESTRICT _data_pdfs_tmp_20_39_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_39; + double * RESTRICT _data_pdfs_tmp_20_310_10 = 
_stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_310; + double * RESTRICT _data_pdfs_tmp_20_311_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_311; + double * RESTRICT _data_pdfs_tmp_20_312_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_312; + double * RESTRICT _data_pdfs_tmp_20_313_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_313; + double * RESTRICT _data_pdfs_tmp_20_314_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_314; + double * RESTRICT _data_pdfs_tmp_20_315_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_315; + double * RESTRICT _data_pdfs_tmp_20_316_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_316; + double * RESTRICT _data_pdfs_tmp_20_317_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_317; + double * RESTRICT _data_pdfs_tmp_20_318_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_318; + double * RESTRICT _data_pdfs_tmp_20_319_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_319; + double * RESTRICT _data_pdfs_tmp_20_320_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_320; + double * RESTRICT _data_pdfs_tmp_20_321_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_321; + double * RESTRICT _data_pdfs_tmp_20_322_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_322; + double * RESTRICT _data_pdfs_tmp_20_323_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_323; + double * RESTRICT _data_pdfs_tmp_20_324_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_324; + double * RESTRICT _data_pdfs_tmp_20_325_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_325; + double * RESTRICT _data_pdfs_tmp_20_326_10 = _stride_pdfs_tmp_1*ctr_1 + _data_pdfs_tmp_20_326; + for (int64_t ctr_0 = 0; ctr_0 < _size_pdfs_0; ctr_0 += 1) + { + const double streamed_0 = _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0]; + const double streamed_1 = _data_pdfs_20_31_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_2 = _data_pdfs_20_32_11[_stride_pdfs_0*ctr_0]; + const double streamed_3 = _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_4 = 
_data_pdfs_20_34_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_5 = _data_pdfs_2m1_35_10[_stride_pdfs_0*ctr_0]; + const double streamed_6 = _data_pdfs_21_36_10[_stride_pdfs_0*ctr_0]; + const double streamed_7 = _data_pdfs_20_37_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_8 = _data_pdfs_20_38_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_9 = _data_pdfs_20_39_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_10 = _data_pdfs_20_310_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_11 = _data_pdfs_2m1_311_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_12 = _data_pdfs_2m1_312_11[_stride_pdfs_0*ctr_0]; + const double streamed_13 = _data_pdfs_2m1_313_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_14 = _data_pdfs_2m1_314_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_15 = _data_pdfs_21_315_1m1[_stride_pdfs_0*ctr_0]; + const double streamed_16 = _data_pdfs_21_316_11[_stride_pdfs_0*ctr_0]; + const double streamed_17 = _data_pdfs_21_317_10[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_18 = _data_pdfs_21_318_10[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_19 = _data_pdfs_2m1_319_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_20 = _data_pdfs_2m1_320_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_21 = _data_pdfs_2m1_321_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_22 = _data_pdfs_2m1_322_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_23 = _data_pdfs_21_323_1m1[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_24 = _data_pdfs_21_324_1m1[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + const double streamed_25 = _data_pdfs_21_325_11[_stride_pdfs_0*ctr_0 - _stride_pdfs_0]; + const double streamed_26 = _data_pdfs_21_326_11[_stride_pdfs_0*ctr_0 + _stride_pdfs_0]; + _data_pdfs_tmp_20_30_10[_stride_pdfs_tmp_0*ctr_0] = 
streamed_0; + _data_pdfs_tmp_20_31_10[_stride_pdfs_tmp_0*ctr_0] = streamed_1; + _data_pdfs_tmp_20_32_10[_stride_pdfs_tmp_0*ctr_0] = streamed_2; + _data_pdfs_tmp_20_33_10[_stride_pdfs_tmp_0*ctr_0] = streamed_3; + _data_pdfs_tmp_20_34_10[_stride_pdfs_tmp_0*ctr_0] = streamed_4; + _data_pdfs_tmp_20_35_10[_stride_pdfs_tmp_0*ctr_0] = streamed_5; + _data_pdfs_tmp_20_36_10[_stride_pdfs_tmp_0*ctr_0] = streamed_6; + _data_pdfs_tmp_20_37_10[_stride_pdfs_tmp_0*ctr_0] = streamed_7; + _data_pdfs_tmp_20_38_10[_stride_pdfs_tmp_0*ctr_0] = streamed_8; + _data_pdfs_tmp_20_39_10[_stride_pdfs_tmp_0*ctr_0] = streamed_9; + _data_pdfs_tmp_20_310_10[_stride_pdfs_tmp_0*ctr_0] = streamed_10; + _data_pdfs_tmp_20_311_10[_stride_pdfs_tmp_0*ctr_0] = streamed_11; + _data_pdfs_tmp_20_312_10[_stride_pdfs_tmp_0*ctr_0] = streamed_12; + _data_pdfs_tmp_20_313_10[_stride_pdfs_tmp_0*ctr_0] = streamed_13; + _data_pdfs_tmp_20_314_10[_stride_pdfs_tmp_0*ctr_0] = streamed_14; + _data_pdfs_tmp_20_315_10[_stride_pdfs_tmp_0*ctr_0] = streamed_15; + _data_pdfs_tmp_20_316_10[_stride_pdfs_tmp_0*ctr_0] = streamed_16; + _data_pdfs_tmp_20_317_10[_stride_pdfs_tmp_0*ctr_0] = streamed_17; + _data_pdfs_tmp_20_318_10[_stride_pdfs_tmp_0*ctr_0] = streamed_18; + _data_pdfs_tmp_20_319_10[_stride_pdfs_tmp_0*ctr_0] = streamed_19; + _data_pdfs_tmp_20_320_10[_stride_pdfs_tmp_0*ctr_0] = streamed_20; + _data_pdfs_tmp_20_321_10[_stride_pdfs_tmp_0*ctr_0] = streamed_21; + _data_pdfs_tmp_20_322_10[_stride_pdfs_tmp_0*ctr_0] = streamed_22; + _data_pdfs_tmp_20_323_10[_stride_pdfs_tmp_0*ctr_0] = streamed_23; + _data_pdfs_tmp_20_324_10[_stride_pdfs_tmp_0*ctr_0] = streamed_24; + _data_pdfs_tmp_20_325_10[_stride_pdfs_tmp_0*ctr_0] = streamed_25; + _data_pdfs_tmp_20_326_10[_stride_pdfs_tmp_0*ctr_0] = streamed_26; + } + } + } +} +} + + +namespace internal_d3q27srt_kernel_initialise { +static FUNC_PREFIX void d3q27srt_kernel_initialise(double * RESTRICT const _data_density, double * RESTRICT _data_pdfs, double * RESTRICT const _data_velocity, 
int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1) + { + double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2; + double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2; + double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3; + double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 
11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_319 = _data_pdfs + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_320 = _data_pdfs + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_321 = _data_pdfs + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_322 = _data_pdfs + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_324 = _data_pdfs + _stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1) + { + double * RESTRICT _data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30; + double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30; + double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31; + double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT 
_data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_319_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_319; + double * RESTRICT _data_pdfs_20_320_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_320; + double * RESTRICT _data_pdfs_20_321_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_321; + double * RESTRICT _data_pdfs_20_322_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_322; + double * RESTRICT _data_pdfs_20_323_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_323; + double * RESTRICT _data_pdfs_20_324_10 = _stride_pdfs_1*ctr_1 + 
_data_pdfs_20_324; + double * RESTRICT _data_pdfs_20_325_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_325; + double * RESTRICT _data_pdfs_20_326_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_326; + for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1) + { + const double rho = _data_density_20_30_10[_stride_density_0*ctr_0]; + const double delta_rho = rho - 1.0; + const double u_0 = _data_velocity_20_30_10[_stride_velocity_0*ctr_0]; + const double u_1 = _data_velocity_20_31_10[_stride_velocity_0*ctr_0]; + const double u_2 = _data_velocity_20_32_10[_stride_velocity_0*ctr_0]; + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] = delta_rho*0.29629629629629628 - 0.44444444444444442*(u_0*u_0) - 0.44444444444444442*(u_1*u_1) - 0.44444444444444442*(u_2*u_2); + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_1*0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_1*u_1); + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_1*-0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_1*u_1); + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_0*-0.22222222222222221 - 0.1111111111111111*(u_1*u_1) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_0*u_0); + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_0*0.22222222222222221 - 0.1111111111111111*(u_1*u_1) - 0.1111111111111111*(u_2*u_2) + 0.22222222222222221*(u_0*u_0); + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_2*0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_1*u_1) + 0.22222222222222221*(u_2*u_2); + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] = delta_rho*0.07407407407407407 + u_2*-0.22222222222222221 - 0.1111111111111111*(u_0*u_0) - 0.1111111111111111*(u_1*u_1) + 0.22222222222222221*(u_2*u_2); + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] = 
delta_rho*0.018518518518518517 + u_0*u_1*-0.16666666666666666 + u_0*-0.055555555555555552 + u_1*0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1); + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*0.16666666666666666 + u_0*0.055555555555555552 + u_1*0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1); + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*0.16666666666666666 + u_0*-0.055555555555555552 + u_1*-0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1); + _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_1*-0.16666666666666666 + u_0*0.055555555555555552 + u_1*-0.055555555555555552 - 0.027777777777777776*(u_2*u_2) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_1*u_1); + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*0.16666666666666666 + u_1*0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*-0.16666666666666666 + u_1*-0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*-0.16666666666666666 + u_0*-0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*0.16666666666666666 + u_0*0.055555555555555552 + u_2*0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 
0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*-0.16666666666666666 + u_1*0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_1*u_2*0.16666666666666666 + u_1*-0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_0*u_0) + 0.055555555555555552*(u_1*u_1) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*0.16666666666666666 + u_0*-0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] = delta_rho*0.018518518518518517 + u_0*u_2*-0.16666666666666666 + u_0*0.055555555555555552 + u_2*-0.055555555555555552 - 0.027777777777777776*(u_1*u_1) + 0.055555555555555552*(u_0*u_0) + 0.055555555555555552*(u_2*u_2); + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*0.013888888888888888 
+ u_1*u_2*-0.041666666666666664 + u_1*-0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*-0.013888888888888888 + u_2*0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*-0.041666666666666664 + u_1*0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*-0.041666666666666664 + u_0*u_2*-0.041666666666666664 + u_0*0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*-0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] = delta_rho*0.0046296296296296294 + u_0*u_1*0.041666666666666664 + u_0*u_2*0.041666666666666664 + u_0*-0.013888888888888888 + u_1*u_2*0.041666666666666664 + u_1*-0.013888888888888888 + u_2*-0.013888888888888888 + 0.013888888888888888*(u_0*u_0) + 0.013888888888888888*(u_1*u_1) + 0.013888888888888888*(u_2*u_2); + } + } + } +} +} + + 
+namespace internal_d3q27srt_kernel_getter { +static FUNC_PREFIX void d3q27srt_kernel_getter(double * RESTRICT _data_density, double * RESTRICT const _data_pdfs, double * RESTRICT _data_velocity, int64_t const _size_density_0, int64_t const _size_density_1, int64_t const _size_density_2, int64_t const _stride_density_0, int64_t const _stride_density_1, int64_t const _stride_density_2, int64_t const _stride_pdfs_0, int64_t const _stride_pdfs_1, int64_t const _stride_pdfs_2, int64_t const _stride_pdfs_3, int64_t const _stride_velocity_0, int64_t const _stride_velocity_1, int64_t const _stride_velocity_2, int64_t const _stride_velocity_3) +{ + for (int64_t ctr_2 = 0; ctr_2 < _size_density_2; ctr_2 += 1) + { + double * RESTRICT _data_pdfs_20_310 = _data_pdfs + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_314 = _data_pdfs + _stride_pdfs_2*ctr_2 + 14*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_318 = _data_pdfs + _stride_pdfs_2*ctr_2 + 18*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_319 = _data_pdfs + _stride_pdfs_2*ctr_2 + 19*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_321 = _data_pdfs + _stride_pdfs_2*ctr_2 + 21*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_323 = _data_pdfs + _stride_pdfs_2*ctr_2 + 23*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_325 = _data_pdfs + _stride_pdfs_2*ctr_2 + 25*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_34 = _data_pdfs + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_38 = _data_pdfs + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_313 = _data_pdfs + _stride_pdfs_2*ctr_2 + 13*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_317 = _data_pdfs + _stride_pdfs_2*ctr_2 + 17*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_320 = _data_pdfs + _stride_pdfs_2*ctr_2 + 20*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_322 = _data_pdfs + _stride_pdfs_2*ctr_2 + 22*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_324 = _data_pdfs + 
_stride_pdfs_2*ctr_2 + 24*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_326 = _data_pdfs + _stride_pdfs_2*ctr_2 + 26*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_33 = _data_pdfs + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_37 = _data_pdfs + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_39 = _data_pdfs + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_31 = _data_pdfs + _stride_pdfs_2*ctr_2 + _stride_pdfs_3; + double * RESTRICT _data_pdfs_20_311 = _data_pdfs + _stride_pdfs_2*ctr_2 + 11*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_315 = _data_pdfs + _stride_pdfs_2*ctr_2 + 15*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_312 = _data_pdfs + _stride_pdfs_2*ctr_2 + 12*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_316 = _data_pdfs + _stride_pdfs_2*ctr_2 + 16*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_32 = _data_pdfs + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_35 = _data_pdfs + _stride_pdfs_2*ctr_2 + 5*_stride_pdfs_3; + double * RESTRICT _data_pdfs_20_30 = _data_pdfs + _stride_pdfs_2*ctr_2; + double * RESTRICT _data_pdfs_20_36 = _data_pdfs + _stride_pdfs_2*ctr_2 + 6*_stride_pdfs_3; + double * RESTRICT _data_density_20_30 = _data_density + _stride_density_2*ctr_2; + double * RESTRICT _data_velocity_20_30 = _data_velocity + _stride_velocity_2*ctr_2; + double * RESTRICT _data_velocity_20_31 = _data_velocity + _stride_velocity_2*ctr_2 + _stride_velocity_3; + double * RESTRICT _data_velocity_20_32 = _data_velocity + _stride_velocity_2*ctr_2 + 2*_stride_velocity_3; + for (int64_t ctr_1 = 0; ctr_1 < _size_density_1; ctr_1 += 1) + { + double * RESTRICT _data_pdfs_20_310_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_310; + double * RESTRICT _data_pdfs_20_314_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_314; + double * RESTRICT _data_pdfs_20_318_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_318; + double * RESTRICT _data_pdfs_20_319_10 = 
_stride_pdfs_1*ctr_1 + _data_pdfs_20_319; + double * RESTRICT _data_pdfs_20_321_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_321; + double * RESTRICT _data_pdfs_20_323_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_323; + double * RESTRICT _data_pdfs_20_325_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_325; + double * RESTRICT _data_pdfs_20_34_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_34; + double * RESTRICT _data_pdfs_20_38_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_38; + double * RESTRICT _data_pdfs_20_313_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_313; + double * RESTRICT _data_pdfs_20_317_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_317; + double * RESTRICT _data_pdfs_20_320_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_320; + double * RESTRICT _data_pdfs_20_322_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_322; + double * RESTRICT _data_pdfs_20_324_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_324; + double * RESTRICT _data_pdfs_20_326_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_326; + double * RESTRICT _data_pdfs_20_33_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_33; + double * RESTRICT _data_pdfs_20_37_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_37; + double * RESTRICT _data_pdfs_20_39_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_39; + double * RESTRICT _data_pdfs_20_31_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_31; + double * RESTRICT _data_pdfs_20_311_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_311; + double * RESTRICT _data_pdfs_20_315_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_315; + double * RESTRICT _data_pdfs_20_312_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_312; + double * RESTRICT _data_pdfs_20_316_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_316; + double * RESTRICT _data_pdfs_20_32_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_32; + double * RESTRICT _data_pdfs_20_35_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_35; + double * RESTRICT _data_pdfs_20_30_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_30; + double * RESTRICT _data_pdfs_20_36_10 = _stride_pdfs_1*ctr_1 + _data_pdfs_20_36; + double * RESTRICT 
_data_density_20_30_10 = _stride_density_1*ctr_1 + _data_density_20_30; + double * RESTRICT _data_velocity_20_30_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_30; + double * RESTRICT _data_velocity_20_31_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_31; + double * RESTRICT _data_velocity_20_32_10 = _stride_velocity_1*ctr_1 + _data_velocity_20_32; + for (int64_t ctr_0 = 0; ctr_0 < _size_density_0; ctr_0 += 1) + { + const double vel0Term = _data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_34_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double momdensity_0 = vel0Term - 1.0*_data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_37_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double vel1Term = _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_31_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_37_10[_stride_pdfs_0*ctr_0]; + const double momdensity_1 = vel1Term - 1.0*_data_pdfs_20_310_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_321_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] - 
1.0*_data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_39_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_38_10[_stride_pdfs_0*ctr_0]; + const double vel2Term = _data_pdfs_20_312_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_313_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_322_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_35_10[_stride_pdfs_0*ctr_0]; + const double delta_rho = vel0Term + vel1Term + vel2Term + _data_pdfs_20_30_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_32_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_33_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_39_10[_stride_pdfs_0*ctr_0]; + const double momdensity_2 = vel2Term - 1.0*_data_pdfs_20_315_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_316_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_317_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_318_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_323_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_324_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_325_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_326_10[_stride_pdfs_0*ctr_0] - 1.0*_data_pdfs_20_36_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_311_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_314_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_319_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_320_10[_stride_pdfs_0*ctr_0] + _data_pdfs_20_321_10[_stride_pdfs_0*ctr_0]; + const double rho = delta_rho + 1.0; + const double u_0 = momdensity_0; + const double u_1 = momdensity_1; + const double u_2 = momdensity_2; + _data_density_20_30_10[_stride_density_0*ctr_0] = rho; + _data_velocity_20_30_10[_stride_velocity_0*ctr_0] = u_0; + _data_velocity_20_31_10[_stride_velocity_0*ctr_0] = u_1; + _data_velocity_20_32_10[_stride_velocity_0*ctr_0] = u_2; + } + } + } +} +} + + 
+ + + +void D3Q27SRT::streamCollide( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamCollide::d3q27srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega); +} +void D3Q27SRT::streamCollideCellInterval( 
field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamCollide::d3q27srt_kernel_streamCollide(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, 
_stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3, omega); +} + +void D3Q27SRT::collide( field::GhostLayerField<double, 27> * pdfs, double omega, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + internal_d3q27srt_kernel_collide::d3q27srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} +void D3Q27SRT::collideCellInterval( field::GhostLayerField<double, 27> * pdfs, double omega, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + 
WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + internal_d3q27srt_kernel_collide::d3q27srt_kernel_collide(_data_pdfs, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, omega); +} + +void D3Q27SRT::stream( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 
= int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_stream::d3q27srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} +void D3Q27SRT::streamCellInterval( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + 
const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_stream::d3q27srt_kernel_stream(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} + +void D3Q27SRT::streamOnlyNoAdvancement( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_0 = int64_t(int64_c(pdfs->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers)) + const int64_t _size_pdfs_1 = int64_t(int64_c(pdfs->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers)) + const int64_t _size_pdfs_2 = int64_t(int64_c(pdfs->zSize()) + 2*ghost_layers); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t 
_stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamOnlyNoAdvancement::d3q27srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} +void D3Q27SRT::streamOnlyNoAdvancementCellInterval( field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs_tmp->nrOfGhostLayers())) + double * RESTRICT _data_pdfs_tmp = pdfs_tmp->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_pdfs_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_pdfs_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(pdfs->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_pdfs_2 = int64_t(int64_c(ci.zSize()) 
+ 0); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride()); + const int64_t _stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride()); + const int64_t _stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride()); + const int64_t _stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride())); + internal_d3q27srt_kernel_streamOnlyNoAdvancement::d3q27srt_kernel_streamOnlyNoAdvancement(_data_pdfs, _data_pdfs_tmp, _size_pdfs_0, _size_pdfs_1, _size_pdfs_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_pdfs_tmp_0, _stride_pdfs_tmp_1, _stride_pdfs_tmp_2, _stride_pdfs_tmp_3); +} + +void D3Q27SRT::initialise( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers())) + double * RESTRICT const _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT const _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 
2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_initialise::d3q27srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q27SRT::initialiseCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT const _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + 
WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT const _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_initialise::d3q27srt_kernel_initialise(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, 
_stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + +void D3Q27SRT::calculateMacroscopicParameters( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers ) +{ + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(-ghost_layers, -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(-ghost_layers, -ghost_layers, -ghost_layers, 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(density->xSize()) + 2*ghost_layers)) + const int64_t _size_density_0 = int64_t(int64_c(density->xSize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), int64_t(int64_c(density->ySize()) + 2*ghost_layers)) + const int64_t _size_density_1 = int64_t(int64_c(density->ySize()) + 2*ghost_layers); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(density->zSize()) + 2*ghost_layers)) + const int64_t _size_density_2 = int64_t(int64_c(density->zSize()) + 2*ghost_layers); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t 
_stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_getter::d3q27srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} +void D3Q27SRT::calculateMacroscopicParametersCellInterval( field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci) +{ + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(density->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(density->nrOfGhostLayers())) + double * RESTRICT _data_density = density->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(pdfs->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(pdfs->nrOfGhostLayers())) + double * RESTRICT const _data_pdfs = pdfs->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(ci.xMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.yMin(), -int_c(velocity->nrOfGhostLayers())) + WALBERLA_ASSERT_GREATER_EQUAL(ci.zMin(), -int_c(velocity->nrOfGhostLayers())) + double * RESTRICT _data_velocity = velocity->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->xSizeWithGhostLayer(), int64_t(int64_c(ci.xSize()) + 0)) + const int64_t _size_density_0 = int64_t(int64_c(ci.xSize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->ySizeWithGhostLayer(), 
int64_t(int64_c(ci.ySize()) + 0)) + const int64_t _size_density_1 = int64_t(int64_c(ci.ySize()) + 0); + WALBERLA_ASSERT_GREATER_EQUAL(density->zSizeWithGhostLayer(), int64_t(int64_c(ci.zSize()) + 0)) + const int64_t _size_density_2 = int64_t(int64_c(ci.zSize()) + 0); + const int64_t _stride_density_0 = int64_t(density->xStride()); + const int64_t _stride_density_1 = int64_t(density->yStride()); + const int64_t _stride_density_2 = int64_t(density->zStride()); + const int64_t _stride_pdfs_0 = int64_t(pdfs->xStride()); + const int64_t _stride_pdfs_1 = int64_t(pdfs->yStride()); + const int64_t _stride_pdfs_2 = int64_t(pdfs->zStride()); + const int64_t _stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride())); + const int64_t _stride_velocity_0 = int64_t(velocity->xStride()); + const int64_t _stride_velocity_1 = int64_t(velocity->yStride()); + const int64_t _stride_velocity_2 = int64_t(velocity->zStride()); + const int64_t _stride_velocity_3 = int64_t(1 * int64_t(velocity->fStride())); + internal_d3q27srt_kernel_getter::d3q27srt_kernel_getter(_data_density, _data_pdfs, _data_velocity, _size_density_0, _size_density_1, _size_density_2, _stride_density_0, _stride_density_1, _stride_density_2, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, _stride_pdfs_3, _stride_velocity_0, _stride_velocity_1, _stride_velocity_2, _stride_velocity_3); +} + + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL ) +#pragma warning pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/D3Q27SRT.h b/src/lbm_generated/sweep_collection/D3Q27SRT.h new file mode 100644 index 0000000000000000000000000000000000000000..eb45b71660fbf902d16cd064e2f09dadf24548d7 --- /dev/null +++ b/src/lbm_generated/sweep_collection/D3Q27SRT.h @@ -0,0 +1,1131 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file D3Q27SRT.h +//! \\author pystencils +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/logging/Logging.h" +#include "core/Macros.h" + + + +#include "domain_decomposition/BlockDataID.h" +#include "domain_decomposition/IBlock.h" +#include "domain_decomposition/StructuredBlockStorage.h" + +#include "field/SwapableCompare.h" +#include "field/GhostLayerField.h" + +#include <set> +#include <cmath> + + + +using namespace std::placeholders; + +#ifdef __GNUC__ +#define RESTRICT __restrict__ +#elif _MSC_VER +#define RESTRICT __restrict +#else +#define RESTRICT +#endif + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# pragma GCC diagnostic ignored "-Wreorder" +#endif + +namespace walberla { +namespace lbm { + + +class D3Q27SRT +{ +public: + enum Type { ALL = 0, INNER = 1, OUTER = 2 }; + + D3Q27SRT(const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID pdfsID_, BlockDataID densityID_, 
BlockDataID velocityID_, double omega, const Cell & outerWidth=Cell(1, 1, 1)) + : blocks_(blocks), pdfsID(pdfsID_), densityID(densityID_), velocityID(velocityID_), omega_(omega), outerWidth_(outerWidth) + { + + + for (auto& iBlock : *blocks) + { + if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || + int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || + int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2) + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + }; + + + ~D3Q27SRT() { + for(auto p: cache_pdfs_) { + delete p; + } + } + + + /************************************************************************************* + * Internal Function Definitions with raw Pointer + *************************************************************************************/ + static void streamCollide (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const cell_idx_t ghost_layers = 0); + static void streamCollideCellInterval (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, double omega, const CellInterval & ci); + + static void collide (field::GhostLayerField<double, 27> * pdfs, double omega, const cell_idx_t ghost_layers = 0); + static void collideCellInterval (field::GhostLayerField<double, 27> * pdfs, double omega, const CellInterval & ci); + + static void stream (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamCellInterval (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci); + + static void streamOnlyNoAdvancement (field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const cell_idx_t ghost_layers = 0); + static void streamOnlyNoAdvancementCellInterval (field::GhostLayerField<double, 27> * 
pdfs, field::GhostLayerField<double, 27> * pdfs_tmp, const CellInterval & ci); + + static void initialise (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void initialiseCellInterval (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + static void calculateMacroscopicParameters (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const cell_idx_t ghost_layers = 0); + static void calculateMacroscopicParametersCellInterval (field::GhostLayerField<double, 1> * density, field::GhostLayerField<double, 27> * pdfs, field::GhostLayerField<double, 3> * velocity, const CellInterval & ci); + + + /************************************************************************************* + * Function Definitions for external Usage + *************************************************************************************/ + + std::function<void (IBlock *)> streamCollide() + { + return [this](IBlock* block) { streamCollide(block); }; + } + + std::function<void (IBlock *)> streamCollide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this](IBlock* block) { streamCollide(block); }; + } + } + + std::function<void (IBlock *)> streamCollide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamCollideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamCollideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamCollide(block, ghost_layers); }; + } + } + + + + void 
streamCollide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollide(pdfs, pdfs_tmp, omega, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCollideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamCollideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = 
*it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCollideCellInterval(pdfs, pdfs_tmp, omega, inner); + } + + void streamCollideOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCollideCellInterval(pdfs, pdfs_tmp, omega, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> collide() + { + return [this](IBlock* block) { collide(block); }; + } + + std::function<void (IBlock *)> collide(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); 
}; + case Type::OUTER: + return [this](IBlock* block) { collideOuter(block); }; + default: + return [this](IBlock* block) { collide(block); }; + } + } + + std::function<void (IBlock *)> collide(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { collideInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { collideOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { collide(block, ghost_layers); }; + } + } + + + + void collide(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + void collide(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + collide(pdfs, omega, ghost_layers); + + } + + + + void collideCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + collideCellInterval(pdfs, omega, ci); + + } + + void collideInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + collideCellInterval(pdfs, omega, inner); + } + + void collideOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + + auto & omega = this->omega_; + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], 
false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + collideCellInterval(pdfs, omega, ci); + } + + + + } + + + std::function<void (IBlock *)> stream() + { + return [this](IBlock* block) { stream(block); }; + } + + std::function<void (IBlock *)> stream(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this](IBlock* block) { stream(block); }; + } + } + + std::function<void (IBlock *)> stream(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { stream(block, ghost_layers); }; + } + } + + + + void stream(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void stream(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = 
block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + stream(pdfs, pdfs_tmp, ghost_layers); + pdfs->swapDataPointers(pdfs_tmp); + + } + + + + void streamCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamCellInterval(pdfs, pdfs_tmp, ci); + pdfs->swapDataPointers(pdfs_tmp); + + } + + void streamInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + streamCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + 
pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamCellInterval(pdfs, pdfs_tmp, ci); + } + + + pdfs->swapDataPointers(pdfs_tmp); + + } + + + std::function<void (IBlock *)> streamOnlyNoAdvancement() + { + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this](IBlock* block) { streamOnlyNoAdvancement(block); }; + } + } + + std::function<void (IBlock *)> streamOnlyNoAdvancement(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { streamOnlyNoAdvancementInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { streamOnlyNoAdvancementOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { streamOnlyNoAdvancement(block, ghost_layers); }; + } + } + + + + void streamOnlyNoAdvancement(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto pdfs = block->getData< 
field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + void streamOnlyNoAdvancement(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancement(pdfs, pdfs_tmp, ghost_layers); + + } + + + + void streamOnlyNoAdvancementCellInterval(IBlock * block, const CellInterval & ci) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + + } + + void streamOnlyNoAdvancementInner(IBlock * block) + { + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + CellInterval inner = pdfs->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + 
streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, inner); + } + + void streamOnlyNoAdvancementOuter(IBlock * block) + { + + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + field::GhostLayerField<double, 27> * pdfs_tmp; + { + // Getting temporary field pdfs_tmp + auto it = cache_pdfs_.find( pdfs ); + if( it != cache_pdfs_.end() ) + { + pdfs_tmp = *it; + } + else + { + pdfs_tmp = pdfs->cloneUninitialized(); + cache_pdfs_.insert(pdfs_tmp); + } + } + + + + + if( layers_.empty() ) + { + CellInterval ci; + + pdfs->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + pdfs->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + pdfs->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + streamOnlyNoAdvancementCellInterval(pdfs, pdfs_tmp, ci); + } + + + + } + + + std::function<void (IBlock *)> initialise() + { + return [this](IBlock* block) { initialise(block); }; + } + + std::function<void (IBlock *)> initialise(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this](IBlock* block) { initialise(block); }; + } + } + + std::function<void (IBlock *)> initialise(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + 
return [this](IBlock* block) { initialiseInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { initialiseOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { initialise(block, ghost_layers); }; + } + } + + + + void initialise(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + void initialise(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialise(density, pdfs, velocity, ghost_layers); + + } + + + + void initialiseCellInterval(IBlock * block, const CellInterval & ci) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + initialiseCellInterval(density, pdfs, velocity, ci); + + } + + void initialiseInner(IBlock * block) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + initialiseCellInterval(density, pdfs, velocity, inner); + } + + void initialiseOuter(IBlock * block) + { + + auto velocity = block->getData< field::GhostLayerField<double, 3> 
>(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + initialiseCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + std::function<void (IBlock *)> calculateMacroscopicParameters() + { + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { calculateMacroscopicParametersOuter(block); }; + default: + return [this](IBlock* block) { calculateMacroscopicParameters(block); }; + } + } + + std::function<void (IBlock *)> calculateMacroscopicParameters(Type type, const cell_idx_t ghost_layers) + { + switch (type) + { + case Type::INNER: + return [this](IBlock* block) { calculateMacroscopicParametersInner(block); }; + case Type::OUTER: + return [this](IBlock* block) { 
calculateMacroscopicParametersOuter(block); }; + default: + return [this, ghost_layers](IBlock* block) { calculateMacroscopicParameters(block, ghost_layers); }; + } + } + + + + void calculateMacroscopicParameters(IBlock * block) + { + const cell_idx_t ghost_layers = 0; + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + void calculateMacroscopicParameters(IBlock * block, const cell_idx_t ghost_layers) + { + + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParameters(density, pdfs, velocity, ghost_layers); + + } + + + + void calculateMacroscopicParametersCellInterval(IBlock * block, const CellInterval & ci) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + + } + + void calculateMacroscopicParametersInner(IBlock * block) + { + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + CellInterval inner = density->xyzSize(); + inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2])); + + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, inner); + } + + void 
calculateMacroscopicParametersOuter(IBlock * block) + { + + auto velocity = block->getData< field::GhostLayerField<double, 3> >(velocityID); + auto pdfs = block->getData< field::GhostLayerField<double, 27> >(pdfsID); + auto density = block->getData< field::GhostLayerField<double, 1> >(densityID); + + + + + if( layers_.empty() ) + { + CellInterval ci; + + density->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false); + ci.expand(Cell(0, 0, -outerWidth_[2])); + layers_.push_back(ci); + + density->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + density->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false); + ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2])); + layers_.push_back(ci); + } + + + for( auto & ci: layers_ ) + { + calculateMacroscopicParametersCellInterval(density, pdfs, velocity, ci); + } + + + + } + + + + + private: + shared_ptr< StructuredBlockStorage > blocks_; + BlockDataID pdfsID; + BlockDataID densityID; + BlockDataID velocityID; + double omega_; + + private: std::set< field::GhostLayerField<double, 27> *, field::SwapableCompare< field::GhostLayerField<double, 27> * > > cache_pdfs_; + + Cell outerWidth_; + std::vector<CellInterval> layers_; + + +}; + + +} // namespace lbm +} // namespace walberla + + +#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG ) +# pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/lbm_generated/sweep_collection/sweep_collection_generation_script.py 
b/src/lbm_generated/sweep_collection/sweep_collection_generation_script.py new file mode 100644 index 0000000000000000000000000000000000000000..bdc208608f08bf202d361b5c369d48199c5c5ed4 --- /dev/null +++ b/src/lbm_generated/sweep_collection/sweep_collection_generation_script.py @@ -0,0 +1,48 @@ +import sympy as sp + +from pystencils import Target +from pystencils import fields + +from lbmpy.creationfunctions import create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil +from pystencils_walberla import ManualCodeGenerationContext, generate_info_header +from lbmpy_walberla import generate_lbm_sweep_collection + + +with ManualCodeGenerationContext(openmp=False, optimize_for_localhost=False, + mpi=True, double_accuracy=True, cuda=False) as ctx: + + for stencil in [LBStencil(Stencil.D3Q19), LBStencil(Stencil.D3Q27)]: + target = Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + openmp = True if ctx.openmp else False + if ctx.optimize_for_localhost: + cpu_vec = {"nontemporal": False, "assume_aligned": True} + else: + cpu_vec = None + + method = Method.SRT + relaxation_rate = sp.symbols("omega") + streaming_pattern = 'pull' + + pdfs = fields(f"pdfs({stencil.Q}): {data_type}[{stencil.D}D]", layout='fzyx') + density_field, velocity_field = fields(f"density(1), velocity({stencil.D}): {data_type}[{stencil.D}D]", + layout='fzyx') + + macroscopic_fields = {'density': density_field, 'velocity': velocity_field} + + lbm_config = LBMConfig(stencil=stencil, method=method, relaxation_rate=relaxation_rate, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False) + + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + generate_lbm_sweep_collection(ctx, f'{stencil.name}{method.name}', collision_rule, + streaming_pattern='pull', + field_layout='zyxf', + refinement_scaling=None, + macroscopic_fields=macroscopic_fields, + 
target=target, data_type=data_type, + cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) + + ctx.write_all_files() diff --git a/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h b/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h index eeba79e77a31c0989ea397c859babb05c6eca8a0..215c03f3a543ea81545b5e291187bfce15e66734 100644 --- a/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h +++ b/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h @@ -80,7 +80,8 @@ public: inline CurvedLinear( const BoundaryUID & boundaryUID, const FlagUID & uid, PDFField_T * const pdfField, const FlagField_T * const flagField, ParticleField_T * const particleField, const shared_ptr<ParticleAccessor_T>& ac, - const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block ); + const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block, + std::function<real_t(const Vector3<real_t>&)> hydrostaticDensityFct = nullptr ); void pushFlags( std::vector< FlagUID >& uids ) const { uids.push_back( uid_ ); } @@ -119,6 +120,8 @@ private: const StructuredBlockStorage & blockStorage_; const IBlock & block_; + std::function<real_t(const Vector3<real_t>&)> hydrostaticDensityFct_; + real_t lengthScalingFactor_; real_t forceScalingFactor_; @@ -128,8 +131,10 @@ private: template< typename LatticeModel_T, typename FlagField_T, typename ParticleAccessor_T > inline CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >::CurvedLinear( const BoundaryUID & boundaryUID, const FlagUID & uid, PDFField_T * const pdfField, const FlagField_T * const flagField, ParticleField_T * const particleField, const shared_ptr<ParticleAccessor_T>& ac, - const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block ): -Boundary<flag_t>( boundaryUID ), uid_( uid ), pdfField_( pdfField ), flagField_( flagField ), particleField_( particleField ), ac_( ac ), 
domainMask_(domain), blockStorage_( blockStorage ), block_( block ) + const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block, + std::function<real_t(const Vector3<real_t>&)> hydrostaticDensityFct ): +Boundary<flag_t>( boundaryUID ), uid_( uid ), pdfField_( pdfField ), flagField_( flagField ), particleField_( particleField ), +ac_( ac ), domainMask_(domain), blockStorage_( blockStorage ), block_( block ), hydrostaticDensityFct_(hydrostaticDensityFct) { WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ); @@ -159,12 +164,12 @@ inline void CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >::tre WALBERLA_ASSERT_UNEQUAL( mask & this->mask_, numeric_cast<flag_t>(0) ); WALBERLA_ASSERT_EQUAL ( mask & this->mask_, this->mask_ ); // only true if "this->mask_" only contains one single flag, which is the case for the // current implementation of this boundary condition - WALBERLA_ASSERT_UNEQUAL( particleField_->get(nx,ny,nz), ac_->getInvalidUid() ); // determine distance to real boundary, i.e. delta value // cell center of the near-boundary fluid cell Cell nearBoundaryCell(x,y,z); Vector3< real_t > cellCenter = blockStorage_.getBlockLocalCellCenter(block_, nearBoundaryCell); + WALBERLA_ASSERT_UNEQUAL( particleField_->get(nx,ny,nz), ac_->getInvalidUid(), x << " " << y << " " << z << " -> " << nx << " " << ny << " " << nz << " " << cellCenter); // direction of the ray (from the fluid cell center to the boundary cell) Vector3< real_t > direction( lengthScalingFactor_ * real_c( stencil::cx[ dir ] ), @@ -173,7 +178,7 @@ inline void CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >::tre //get particle index auto particleIdx = ac_->uidToIdx(particleField_->get( nx, ny, nz )); - WALBERLA_ASSERT_UNEQUAL( particleIdx, ac_->getInvalidIdx(), "Index of particle is invalid!" ); + WALBERLA_ASSERT_UNEQUAL( particleIdx, ac_->getInvalidIdx(), "Index of particle is invalid! 
" << x << " " << y << " " << z << " -> " << nx << " " << ny << " " << nz << " " << cellCenter << " UID:" << particleField_->get( nx, ny, nz ) ); real_t delta = real_t(0); real_t pdf_new = real_t(0); @@ -268,8 +273,17 @@ inline void CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >::tre // as a consequence, some (non-zero) PDF contributions would be missing after summing up the force contributions // those would need to be added artificially, see e.g. Ernst, Dietzel, Sommerfeld - A lattice Boltzmann method for simulating transport and agglomeration of resolved particles, Acta Mech, 2013 // instead, we use the trick there that we just require the deviations from the equilibrium to get the correct force as it is already used for the incompressible case - pdf_old -= LatticeModel_T::w[ Stencil_T::idx[dir] ]; - pdf_new -= LatticeModel_T::w[ Stencil_T::idx[dir] ]; + if (hydrostaticDensityFct_ == nullptr) + { + pdf_old -= LatticeModel_T::w[Stencil_T::idx[dir]]; + pdf_new -= LatticeModel_T::w[Stencil_T::idx[dir]]; + } + else + { + const real_t rhoHydStat = hydrostaticDensityFct_(cellCenter); + pdf_old -= rhoHydStat * LatticeModel_T::w[Stencil_T::idx[dir]]; + pdf_new -= rhoHydStat * LatticeModel_T::w[Stencil_T::idx[dir]]; + } } // MEM: F = pdf_old + pdf_new - common diff --git a/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h b/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h index aeafb2b255fd965b248c71ee71c56b3c27972348..7d5c528489cae9578b34586238f282ba2dd0df7a 100644 --- a/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h +++ b/src/lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h @@ -72,7 +72,8 @@ public: inline SimpleBB( const BoundaryUID & boundaryUID, const FlagUID & uid, PDFField_T * const pdfField, const FlagField_T * const flagField, ParticleField_T * const particleField, const shared_ptr<ParticleAccessor_T>& ac, - const flag_t domain, const StructuredBlockStorage & 
blockStorage, const IBlock & block ); + const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block, + std::function<real_t(const Vector3<real_t>&)> hydrostaticDensityFct = nullptr ); void pushFlags( std::vector< FlagUID >& uids ) const { uids.push_back( uid_ ); } @@ -110,6 +111,8 @@ private: const StructuredBlockStorage & blockStorage_; const IBlock & block_; + std::function<real_t(const Vector3<real_t>&)> hydrostaticDensityFct_; + real_t lengthScalingFactor_; real_t forceScalingFactor_; @@ -119,8 +122,10 @@ private: template< typename LatticeModel_T, typename FlagField_T, typename ParticleAccessor_T > inline SimpleBB< LatticeModel_T, FlagField_T, ParticleAccessor_T >::SimpleBB( const BoundaryUID & boundaryUID, const FlagUID & uid, PDFField_T * const pdfField, const FlagField_T * const flagField, ParticleField_T * const particleField, const shared_ptr<ParticleAccessor_T>& ac, - const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block ): -Boundary<flag_t>( boundaryUID ), uid_( uid ), pdfField_( pdfField ), flagField_( flagField ), particleField_( particleField ), ac_( ac ), domainMask_(domain), blockStorage_( blockStorage ), block_( block ) + const flag_t domain, const StructuredBlockStorage & blockStorage, const IBlock & block, + std::function<real_t(const Vector3<real_t>&)> hydrostaticDensityFct): +Boundary<flag_t>( boundaryUID ), uid_( uid ), pdfField_( pdfField ), flagField_( flagField ), particleField_( particleField ), ac_( ac ), domainMask_(domain), +blockStorage_( blockStorage ), block_( block ), hydrostaticDensityFct_(hydrostaticDensityFct) { WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); WALBERLA_ASSERT_NOT_NULLPTR( flagField_ ); @@ -213,8 +218,17 @@ inline void SimpleBB< LatticeModel_T, FlagField_T, ParticleAccessor_T >::treatDi // as a consequence, some (non-zero) PDF contributions would be missing after summing up the force contributions // those would need to be added artificially, see e.g. 
Ernst, Dietzel, Sommerfeld - A lattice Boltzmann method for simulating transport and agglomeration of resolved particles, Acta Mech, 2013 // instead, we use the trick there that we just require the deviations from the equilibrium to get the correct force as it is already used for the incompressible case - pdf_old -= LatticeModel_T::w[ Stencil_T::idx[dir] ]; - pdf_new -= LatticeModel_T::w[ Stencil_T::idx[dir] ]; + if (hydrostaticDensityFct_ == nullptr) + { + pdf_old -= LatticeModel_T::w[Stencil_T::idx[dir]]; + pdf_new -= LatticeModel_T::w[Stencil_T::idx[dir]]; + } + else + { + const real_t rhoHydStat = hydrostaticDensityFct_(cellCenter); + pdf_old -= rhoHydStat * LatticeModel_T::w[Stencil_T::idx[dir]]; + pdf_new -= rhoHydStat * LatticeModel_T::w[Stencil_T::idx[dir]]; + } } // MEM: F = pdf_old + pdf_new - common diff --git a/src/mesh/boundary/BoundarySetup.cpp b/src/mesh/boundary/BoundarySetup.cpp index c0edcc1ed81ce102e05eefb1556825730d39b582..e4d443b5378f24f7986c29e6e7e2d901bf58c74d 100644 --- a/src/mesh/boundary/BoundarySetup.cpp +++ b/src/mesh/boundary/BoundarySetup.cpp @@ -89,7 +89,7 @@ void BoundarySetup::allocateOrResetVoxelizationField() } else { - voxelizationFieldId_ = make_shared< BlockDataID >( field::addToStorage< VoxelizationField >( structuredBlockStorage_, "voxelization field", uint8_t(0), field::zyxf, numGhostLayers_ ) ); + voxelizationFieldId_ = make_shared< BlockDataID >( field::addToStorage< VoxelizationField >( structuredBlockStorage_, "voxelization field", uint8_t(0), field::fzyx, numGhostLayers_ ) ); } WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ ); diff --git a/src/pde/boundary/Dirichlet.h b/src/pde/boundary/Dirichlet.h index f5543fdbe95ca04f90bf559604984c1f268a1d3f..929d2b5ba39517a84cccae6329271dc5f2dbf756 100644 --- a/src/pde/boundary/Dirichlet.h +++ b/src/pde/boundary/Dirichlet.h @@ -167,7 +167,7 @@ inline Dirichlet< Stencil_T, flag_t >::Dirichlet( const BoundaryUID & boundaryUI ) #endif - dirichletBC_ = make_shared< Field_T >( 
rhsField_->xSize(), rhsField_->ySize(), rhsField_->zSize(), uint_t(1), field::zyxf ); + dirichletBC_ = make_shared< Field_T >( rhsField_->xSize(), rhsField_->ySize(), rhsField_->zSize(), uint_t(1), field::fzyx ); } diff --git a/src/pde/boundary/Dirichlet_withDx.h b/src/pde/boundary/Dirichlet_withDx.h index c119206926c31891a3d17b73c52e02ec1f92ee3b..93d2dd42e1ae1879b04ec553ce25fc03c55c1996 100644 --- a/src/pde/boundary/Dirichlet_withDx.h +++ b/src/pde/boundary/Dirichlet_withDx.h @@ -133,7 +133,7 @@ inline Dirichlet< Stencil_T, flag_t >::Dirichlet( const BoundaryUID & boundaryUI WALBERLA_ASSERT_EQUAL( rhsField_->xyzSize(), stencilField_->xyzSize() ); - dirichletBC_ = make_shared< Field_T >( rhsField_->xSize(), rhsField_->ySize(), rhsField_->zSize(), uint_t(1), field::zyxf ); + dirichletBC_ = make_shared< Field_T >( rhsField_->xSize(), rhsField_->ySize(), rhsField_->zSize(), uint_t(1), field::fzyx ); for(auto d = Stencil_T::beginNoCenter(); d != Stencil_T::end(); ++d ){ dx_[d.toIdx()] = Vector3<real_t>(stencil::cx[d.toIdx()]*blocks.dx(), stencil::cy[d.toIdx()]*blocks.dy(), stencil::cz[d.toIdx()]*blocks.dz() ).sqrLength(); diff --git a/src/pde/boundary/Neumann.h b/src/pde/boundary/Neumann.h index ba5fb0e8c7e09ee039a793aee5f1623a10053250..76988b2d47d5759d70b04d465eedb2f349bc65c8 100644 --- a/src/pde/boundary/Neumann.h +++ b/src/pde/boundary/Neumann.h @@ -350,7 +350,7 @@ inline Neumann< Stencil_T, flag_t >::Neumann( const BoundaryUID & boundaryUID, c ) #endif - neumannBC_ = make_shared< Field_T >( rhsField_->xSize(), rhsField_->ySize(), rhsField_->zSize(), uint_t(1), field::zyxf ); + neumannBC_ = make_shared< Field_T >( rhsField_->xSize(), rhsField_->ySize(), rhsField_->zSize(), uint_t(1), field::fzyx ); for(auto d = Stencil_T::beginNoCenter(); d != Stencil_T::end(); ++d ){ dx_[d.toIdx()] = Vector3<real_t>(real_c(stencil::cx[d.toIdx()])*blocks.dx(), real_c(stencil::cy[d.toIdx()])*blocks.dy(), real_c(stencil::cz[d.toIdx()])*blocks.dz() ).sqrLength(); diff --git 
a/src/pde/iterations/VCycles.impl.h b/src/pde/iterations/VCycles.impl.h index 8cf132a140c9f55b080b795d9135590a99e25a41..2555b7071208e72004b421cdd2ffaa8b62a4ded9 100644 --- a/src/pde/iterations/VCycles.impl.h +++ b/src/pde/iterations/VCycles.impl.h @@ -60,7 +60,7 @@ VCycles< Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T >:: // Set up fields for finest level uId_.push_back( uFieldId ); fId_.push_back( fFieldId ); - rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_0", real_t(0), field::zyxf, uint_t(1) ) ); + rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_0", real_t(0), field::fzyx, uint_t(1) ) ); // Check that coarsest grid has more than one cell per dimension auto block = blocks->begin(); @@ -91,9 +91,9 @@ VCycles< Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T >:: for ( uint_t lvl = 1; lvl < numLvl; ++lvl ) { auto getSize = std::bind(VCycles<Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T>::getSizeForLevel, lvl, std::placeholders::_1, std::placeholders::_2); - uId_.push_back( field::addToStorage< PdeField_T >( blocks, "u_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); - fId_.push_back( field::addToStorage< PdeField_T >( blocks, "f_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); - rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); + uId_.push_back( field::addToStorage< PdeField_T >( blocks, "u_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); + fId_.push_back( field::addToStorage< PdeField_T >( blocks, "f_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); + rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); for ( auto & w: weights_[lvl] ) { @@ -104,8 +104,8 @@ VCycles< Stencil_T, OperatorCoarsening_T, Restrict_T, 
ProlongateAndCorrect_T >:: // Set up fields for CG on coarsest level auto getFineSize = std::bind(VCycles<Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T>::getSizeForLevel, numLvl-1, std::placeholders::_1, std::placeholders::_2); - dId_ = field::addToStorage< PdeField_T >( blocks, "d", getFineSize, real_t(0), field::zyxf, uint_t(1) ); - zId_ = field::addToStorage< PdeField_T >( blocks, "z", getFineSize, real_t(0), field::zyxf, uint_t(1) ); + dId_ = field::addToStorage< PdeField_T >( blocks, "d", getFineSize, real_t(0), field::fzyx, uint_t(1) ); + zId_ = field::addToStorage< PdeField_T >( blocks, "z", getFineSize, real_t(0), field::fzyx, uint_t(1) ); // Set up communication for ( uint_t lvl = 0; lvl < numLvl-1; ++lvl ) @@ -167,7 +167,7 @@ VCycles< Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T >:: // Set up fields for finest level uId_.push_back( uFieldId ); fId_.push_back( fFieldId ); - rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_0", real_t(0), field::zyxf, uint_t(1) ) ); + rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_0", real_t(0), field::fzyx, uint_t(1) ) ); stencilId_.push_back( stencilFieldId ); // Check that coarsest grid has more than one cell per dimension @@ -197,10 +197,10 @@ VCycles< Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T >:: for ( uint_t lvl = 1; lvl < numLvl; ++lvl ) { auto getSize = std::bind(VCycles<Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T>::getSizeForLevel, lvl, std::placeholders::_1, std::placeholders::_2); - uId_.push_back( field::addToStorage< PdeField_T >( blocks, "u_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); - fId_.push_back( field::addToStorage< PdeField_T >( blocks, "f_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); - rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); - 
stencilId_.push_back( field::addToStorage< StencilField_T >( blocks, "w_"+std::to_string(lvl), getSize, real_t(0), field::zyxf, uint_t(1) ) ); + uId_.push_back( field::addToStorage< PdeField_T >( blocks, "u_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); + fId_.push_back( field::addToStorage< PdeField_T >( blocks, "f_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); + rId_.push_back( field::addToStorage< PdeField_T >( blocks, "r_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); + stencilId_.push_back( field::addToStorage< StencilField_T >( blocks, "w_"+std::to_string(lvl), getSize, real_t(0), field::fzyx, uint_t(1) ) ); } // CoarsenStencilFieldsDCA<Stencil_T>( blocks, stencilId_, numLvl, uint_t(2)) (); // scaling by ( 1/h^2 )^lvl @@ -209,8 +209,8 @@ VCycles< Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T >:: // Set up fields for CG on coarsest level auto getFineSize = std::bind(VCycles<Stencil_T, OperatorCoarsening_T, Restrict_T, ProlongateAndCorrect_T>::getSizeForLevel, numLvl-1, std::placeholders::_1, std::placeholders::_2); - dId_ = field::addToStorage< PdeField_T >( blocks, "d", getFineSize, real_t(0), field::zyxf, uint_t(1) ); - zId_ = field::addToStorage< PdeField_T >( blocks, "z", getFineSize, real_t(0), field::zyxf, uint_t(1) ); + dId_ = field::addToStorage< PdeField_T >( blocks, "d", getFineSize, real_t(0), field::fzyx, uint_t(1) ); + zId_ = field::addToStorage< PdeField_T >( blocks, "z", getFineSize, real_t(0), field::fzyx, uint_t(1) ); // Set up communication for ( uint_t lvl = 0; lvl < numLvl-1; ++lvl ) diff --git a/src/python_coupling/CMakeLists.txt b/src/python_coupling/CMakeLists.txt index 08266d3a9a51a452ba7022de56994f935fa7861c..309a7cb9a451fe2b9d64347851334a158f7e6a63 100644 --- a/src/python_coupling/CMakeLists.txt +++ b/src/python_coupling/CMakeLists.txt @@ -1,7 +1,7 @@ add_library( python_coupling ) target_link_libraries( python_coupling PUBLIC pybind11::embed 
core communication domain_decomposition stencil field blockforest vtk ) - if( WALBERLA_BUILD_WITH_CUDA ) - target_link_libraries( python_coupling PUBLIC cuda ) + if( WALBERLA_BUILD_WITH_GPU_SUPPORT ) + target_link_libraries( python_coupling PUBLIC gpu ) endif() target_sources( python_coupling PRIVATE diff --git a/src/python_coupling/export/CMakeLists.txt b/src/python_coupling/export/CMakeLists.txt index bea431188570ec668ce21d5d50ebf2d372c81591..a7b38929727c181622e0615baf877fb1d38e1abf 100644 --- a/src/python_coupling/export/CMakeLists.txt +++ b/src/python_coupling/export/CMakeLists.txt @@ -2,7 +2,8 @@ target_sources( python_coupling PRIVATE GatherExport.impl.h VTKExport.cpp - CUDAExport.h + GPUExport.h + GPUExport.impl.h FieldCommunicationExport.impl.h BasicExport.cpp BlockForestCommunicationExport.h @@ -16,5 +17,4 @@ target_sources( python_coupling BasicExport.h FieldExport.impl.h FieldExports.h - CUDAExport.impl.h ) diff --git a/src/python_coupling/export/FieldExport.impl.h b/src/python_coupling/export/FieldExport.impl.h index 55d2e9394ed4ca670a23b4415936d4c13bc9f75e..60ba3fd102d2c05cf252f9385e1a2f816f25eb33 100644 --- a/src/python_coupling/export/FieldExport.impl.h +++ b/src/python_coupling/export/FieldExport.impl.h @@ -631,7 +631,7 @@ void exportFields(py::module_& m) [](std::array< uint_t, 4 > size, py::object & dtype, uint_t ghostLayers, Layout layout, uint_t alignment) { return internal::createPythonField< FieldTypes... >(size, dtype, ghostLayers, layout, alignment); }, - "size"_a, "dtype"_a, "ghostLayers"_a = uint_t(1), "layout"_a = zyxf, "alignment"_a = 0); + "size"_a, "dtype"_a, "ghostLayers"_a = uint_t(1), "layout"_a = fzyx, "alignment"_a = 0); m2.def( "addToStorage", @@ -639,7 +639,7 @@ void exportFields(py::module_& m) Layout layout, uint_t ghostLayers, real_t initValue, uint_t alignment) { return internal::addToStorage< FieldTypes... 
>(blocks, name, dtype, fSize, ghostLayers, layout, initValue, alignment); }, - "blocks"_a, "name"_a, "dtype"_a, "fSize"_a = 1, "layout"_a = zyxf, "ghostLayers"_a = uint_t(1), "initValue"_a = 0.0, "alignment"_a = 0); + "blocks"_a, "name"_a, "dtype"_a, "fSize"_a = 1, "layout"_a = fzyx, "ghostLayers"_a = uint_t(1), "initValue"_a = 0.0, "alignment"_a = 0); m2.def( "createVTKWriter", [](const shared_ptr<StructuredBlockForest> & blocks, const std::string & name, diff --git a/src/python_coupling/export/CUDAExport.h b/src/python_coupling/export/GPUExport.h similarity index 92% rename from src/python_coupling/export/CUDAExport.h rename to src/python_coupling/export/GPUExport.h index 505aa3368e3008dec45eecf5acd9f3a231bc7b6e..6976a8c6ac51f41ad02ca4b66171cb568ae75e80 100644 --- a/src/python_coupling/export/CUDAExport.h +++ b/src/python_coupling/export/GPUExport.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file CUDAExport.h -//! \ingroup cuda +//! \file GPUExport.h +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> //! \author Markus Holzer <markus.holzer@fau.de> // @@ -26,7 +26,8 @@ namespace walberla { -namespace cuda { +namespace gpu +{ template<typename... 
GpuFields> @@ -36,9 +37,9 @@ namespace cuda { void exportCopyFunctionsToPython(py::module_ &m); -} // namespace cuda +} // namespace gpu } // namespace walberla -#include "CUDAExport.impl.h" +#include "GPUExport.impl.h" #endif //WALBERLA_BUILD_WITH_PYTHON diff --git a/src/python_coupling/export/CUDAExport.impl.h b/src/python_coupling/export/GPUExport.impl.h similarity index 92% rename from src/python_coupling/export/CUDAExport.impl.h rename to src/python_coupling/export/GPUExport.impl.h index eb60759cbec17da1c81b52f80f31706068ea071f..cffbc245e985ba208b50569a2bfc3125f61c0e6a 100644 --- a/src/python_coupling/export/CUDAExport.impl.h +++ b/src/python_coupling/export/GPUExport.impl.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file CUDAExport.impl.h -//! \ingroup cuda +//! \file GPUExport.impl.h +//! \ingroup gpu //! \author Martin Bauer <martin.bauer@fau.de> //! 
\author Markus Holzer <markus.holzer@fau.de> // @@ -23,10 +23,10 @@ // Do not reorder includes - the include order is important #include "core/logging/Logging.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/communication/GPUPackInfo.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/communication/GPUPackInfo.h" #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -35,7 +35,8 @@ #include "python_coupling/helper/MplHelpers.h" namespace walberla { -namespace cuda { +namespace gpu +{ @@ -70,7 +71,7 @@ using namespace pybind11::literals; typedef typename GpuField_T::value_type T; std::string data_type_name = field::internal::PythonFormatString<T>::get(); - std::string class_name = "GpuField_" + data_type_name; + std::string const class_name = "GpuField_" + data_type_name; py::class_<GpuField_T, shared_ptr<GpuField_T>>(m_, class_name.c_str() ) .def_property_readonly("layout", &field::internal::field_layout < GpuField_T > ) .def_property_readonly("size", &field::internal::field_size < GpuField_T > ) @@ -89,11 +90,11 @@ using namespace pybind11::literals; using field::communication::PackInfo; using communication::GPUPackInfo; - std::string GpuFieldPackInfoName = "GpuFieldPackInfo_" + data_type_name; + std::string const GpuFieldPackInfoName = "GpuFieldPackInfo_" + data_type_name; py::class_< GPUPackInfo<GpuField_T>, shared_ptr< GPUPackInfo<GpuField_T> >, walberla::communication::UniformPackInfo>(m_, GpuFieldPackInfoName.c_str() ); using field::communication::UniformMPIDatatypeInfo; - std::string GpuFieldMPIDataTypeInfoName = "GpuFieldMPIDataTypeInfo_" + data_type_name; + std::string const GpuFieldMPIDataTypeInfoName = "GpuFieldMPIDataTypeInfo_" + data_type_name; py::class_< UniformMPIDatatypeInfo<GpuField_T>, shared_ptr< UniformMPIDatatypeInfo<GpuField_T> >, walberla::communication::UniformMPIDatatypeInfo>(m_, 
GpuFieldMPIDataTypeInfoName.c_str() ); } @@ -167,7 +168,7 @@ using namespace pybind11::literals; template< typename GpuField_T> void operator() ( python_coupling::NonCopyableWrap<GpuField_T> ) { - using cuda::communication::GPUPackInfo; + using gpu::communication::GPUPackInfo; IBlock * firstBlock = & ( * blocks_->begin() ); if( firstBlock->isDataClassOrSubclassOf<GpuField_T>(fieldId_) ) @@ -199,7 +200,7 @@ using namespace pybind11::literals; static py::object PackInfoWrapper(const shared_ptr<StructuredBlockForest> & blocks, const std::string & name, uint_t numberOfGhostLayers ) { - using cuda::communication::GPUPackInfo; + using gpu::communication::GPUPackInfo; BlockDataID fieldID = python_coupling::blockDataIDFromString( *blocks, name ); if ( blocks->begin() == blocks->end() ) { @@ -296,15 +297,15 @@ class copyFieldToGpuDispatchExporter template< typename CpuField_T> void operator() ( python_coupling::NonCopyableWrap<CpuField_T> ) { - typedef cuda::GPUField<typename CpuField_T::value_type> GpuField_T; + typedef gpu::GPUField<typename CpuField_T::value_type> GpuField_T; IBlock * firstBlock = & ( * blocks_->begin() ); if(firstBlock->isDataClassOrSubclassOf< CpuField_T > ( cpuFieldId_ ) ) { if(toGPU_) - cuda::fieldCpy<GpuField_T, CpuField_T>(blocks_, gpuFieldId_, cpuFieldId_); + gpu::fieldCpy<GpuField_T, CpuField_T>(blocks_, gpuFieldId_, cpuFieldId_); else - cuda::fieldCpy<CpuField_T, GpuField_T>(blocks_, cpuFieldId_, gpuFieldId_); + gpu::fieldCpy<CpuField_T, GpuField_T>(blocks_, cpuFieldId_, gpuFieldId_); } } private: @@ -321,8 +322,8 @@ void copyFieldToGPU(const shared_ptr< StructuredBlockForest > & blocks, const st namespace py = pybind11; auto result = make_shared<py::object>(); - BlockDataID gpuFieldId = python_coupling::blockDataIDFromString( *blocks, gpuFieldName ); - BlockDataID cpuFieldId = python_coupling::blockDataIDFromString( *blocks, cpuFieldName ); + BlockDataID const gpuFieldId = python_coupling::blockDataIDFromString( *blocks, gpuFieldName ); + 
BlockDataID const cpuFieldId = python_coupling::blockDataIDFromString( *blocks, cpuFieldName ); copyFieldToGpuDispatchExporter exporter( blocks, gpuFieldId, cpuFieldId, toGPU ); python_coupling::for_each_noncopyable_type<CpuFields...>( std::ref(exporter) ); @@ -335,7 +336,7 @@ using namespace pybind11::literals; template<typename... GpuFields> void exportModuleToPython(py::module_ &m) { - py::module_ m2 = m.def_submodule("cuda", "Cuda Extension of the waLBerla python bindings"); + py::module_ m2 = m.def_submodule("gpu", "GPU (CUDA / HIP) Extension of the waLBerla python bindings"); python_coupling::for_each_noncopyable_type<GpuFields...>( internal::GpuFieldExporter(m2) ); @@ -345,7 +346,7 @@ void exportModuleToPython(py::module_ &m) bool usePitchedMem, uint_t ghostLayers, Layout layout) { return internal::addToStorage<GpuFields...>(blocks, name, dtype, fSize, ghostLayers, layout, usePitchedMem); }, - "blocks"_a, "name"_a, "dtype"_a, "fSize"_a=1, "usePitchedMem"_a=false, "ghostLayers"_a=uint(1), "layout"_a=zyxf); + "blocks"_a, "name"_a, "dtype"_a, "fSize"_a=1, "usePitchedMem"_a=false, "ghostLayers"_a=uint(1), "layout"_a=fzyx); m2.def( "createPackInfo", @@ -368,7 +369,7 @@ void exportModuleToPython(py::module_ &m) template<typename... 
CpuFields > void exportCopyFunctionsToPython(py::module_ &m) { - py::module_ m2 = m.def_submodule("cuda", "Cuda Extension of the waLBerla python bindings"); + py::module_ m2 = m.def_submodule("gpu", "GPU (CUDA / HIP) Extension of the waLBerla python bindings"); m2.def( "copyFieldToGpu", @@ -388,7 +389,7 @@ void exportCopyFunctionsToPython(py::module_ &m) -} // namespace cuda +} // namespace gpu } // namespace walberla diff --git a/src/stencil/Directions.h b/src/stencil/Directions.h index d3a75b812131878c4222d4cf4fa5ec5953e1f087..5be0d72223712c7cc8f4aa2b991bb9456f09aadb 100644 --- a/src/stencil/Directions.h +++ b/src/stencil/Directions.h @@ -9,9 +9,10 @@ #pragma once // core includes -#include "core/DataTypes.h" #include "core/cell/Cell.h" +#include "core/DataTypes.h" #include "core/debug/Debug.h" +#include "core/math/Vector3.h" // STL includes #include <string> @@ -135,6 +136,39 @@ namespace stencil { } }; + + /// Maps a (x,y,z) direction vector to its direction \ingroup stencil + inline Direction vectorToDirection(cell_idx_t x, cell_idx_t y, cell_idx_t z){ + static const Direction vecToDirArr[3][3][3] = { + { // x = -1 + {BSW, SW, TSW}, // y = -1 + {BW, W, TW}, // y = 0 + {BNW, NW, TNW} // y = 1 + }, + { // x = 0 + {BS, S, TS}, // y = -1 + {B, C, T}, // y = 0 + {BN, N, TN} // y = 1 + }, + { // x = 1 + {BSE, SE, TSE}, // y = -1 + {BE, E, TE}, // y = 0 + {BNE, NE, TNE} // y = 1 + } + }; + + return vecToDirArr[x + 1][y + 1][z + 1]; + } + + inline Direction vectorToDirection(Vector3< cell_idx_t > vec){ + return vectorToDirection(vec[0], vec[1], vec[2]); + } + + inline bool isFaceDirection(Direction dir) { return 1 <= dir && dir <= 6; } + inline bool isEdgeDirection(Direction dir) { return 7 <= dir && dir <= 18; } + inline bool isCornerDirection(Direction dir) { return 19 <= dir; } + + /// The x,y,z component for each normalized direction \ingroup stencil const real_t cNorm[3][NR_OF_DIRECTIONS] = { { diff --git a/src/timeloop/CMakeLists.txt 
b/src/timeloop/CMakeLists.txt index 9035c3d9ab1a6620f39b1a5e38295e9704ae3d0d..ba2ef178c97a7144d5dd5f7a06c5ad76dad763b3 100644 --- a/src/timeloop/CMakeLists.txt +++ b/src/timeloop/CMakeLists.txt @@ -14,8 +14,6 @@ target_sources( timeloop SweepTimeloop.h Timeloop.h PerformanceMeter.cpp - SweepTimeloop.cpp - Timeloop.cpp ) ################################################################################################### diff --git a/src/timeloop/SelectableFunctionCreators.h b/src/timeloop/SelectableFunctionCreators.h index b014f44deec75c414387fb4f68df428091db175b..b877b293e6e7c62aa3917b560ebbf430770ba8e9 100644 --- a/src/timeloop/SelectableFunctionCreators.h +++ b/src/timeloop/SelectableFunctionCreators.h @@ -186,7 +186,7 @@ namespace timeloop { private: - friend class SweepTimeloop; + template < typename TimingPolicy > friend class SweepTimeloop; BlockStorage & bs_; diff --git a/src/timeloop/SweepTimeloop.h b/src/timeloop/SweepTimeloop.h index 4ffc8df9e0937ff9b5dd5a2cbcea0b4407078c2b..19e9344a68e30b1efb2c090f4d8a588509cd9b8d 100644 --- a/src/timeloop/SweepTimeloop.h +++ b/src/timeloop/SweepTimeloop.h @@ -112,7 +112,8 @@ namespace timeloop { * \ingroup timeloop */ //******************************************************************************************************************* - class SweepTimeloop : public Timeloop + template < typename TP = timing::WcPolicy> + class SweepTimeloop : public Timeloop<TP> { public: @@ -121,11 +122,11 @@ namespace timeloop { //@{ SweepTimeloop( BlockStorage & blockStorage, uint_t nrOfTimeSteps ) - : Timeloop(nrOfTimeSteps), blockStorage_(blockStorage), nextId_(0),firstRun_(true) + : Timeloop<TP>(nrOfTimeSteps), blockStorage_(blockStorage), nextId_(0),firstRun_(true) {} SweepTimeloop( const shared_ptr<StructuredBlockStorage> & structuredBlockStorage, uint_t nrOfTimeSteps ) - : Timeloop(nrOfTimeSteps), blockStorage_( structuredBlockStorage->getBlockStorage() ), + : Timeloop<TP>(nrOfTimeSteps), blockStorage_( 
structuredBlockStorage->getBlockStorage() ), nextId_(0), firstRun_(true) {} @@ -167,7 +168,7 @@ namespace timeloop { } void doTimeStep(const Set<SUID> &selectors) override; - void doTimeStep(const Set<SUID> &selectors, WcTimingPool &tp) override; + void doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &tp) override; uint_t nextId_; std::vector<uint_t> sweepsToDelete_; @@ -180,7 +181,7 @@ namespace timeloop { } // namespace timeloop } // namespace walberla - +#include "SweepTimeloop.impl.h" //====================================================================================================================== // @@ -189,7 +190,8 @@ namespace timeloop { //====================================================================================================================== namespace walberla { - using timeloop::SweepTimeloop; + using SweepTimeloop = typename timeloop::SweepTimeloop < >; + using DeviceSynchronizeSweepTimeloop = typename timeloop::SweepTimeloop < timing::DeviceSynchronizePolicy >; using timeloop::Sweep; using timeloop::SweepOnBlock; diff --git a/src/timeloop/SweepTimeloop.cpp b/src/timeloop/SweepTimeloop.impl.h similarity index 84% rename from src/timeloop/SweepTimeloop.cpp rename to src/timeloop/SweepTimeloop.impl.h index 6064efa27af1dce8a8b435132f961325759aa8a1..481ddbacad80e5d167bcd6d2925c51d6d48db400 100644 --- a/src/timeloop/SweepTimeloop.cpp +++ b/src/timeloop/SweepTimeloop.impl.h @@ -32,8 +32,8 @@ namespace timeloop { ////////////////////////// Execution of Timeloop //////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) +template < typename TP > +void SweepTimeloop<TP>::doTimeStep(const Set<SUID> &selectors) { removeForDeletionMarkedSweeps(); //iterate over all registered sweeps @@ -43,7 +43,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> 
&selectors) //select and execute before functions for( size_t j=0; j < s.beforeFuncs.size(); ++j ) - executeSelectable(s.beforeFuncs[j].selectableFunc_,selectors,"Pre-Sweep Function"); + this->executeSelectable(s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function"); // Loop over all blocks for( BlockStorage::iterator bi = blockStorage_.begin(); bi != blockStorage_.end(); ++bi ) @@ -52,11 +52,11 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) if( s.sweep.empty() ) { WALBERLA_ABORT("Selecting Sweep " << sweepIt->first << ": " << - "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ); + "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ) } // ensure that exactly one sweep has been registered that matches the specified selectors - size_t numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); + size_t const numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); if (numSweeps == size_t(0)) { continue; @@ -73,7 +73,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) { std::string sweepName; s.sweep.getUnique( selectors + bi->getState(), sweepName ); - WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ); + WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ) } (selectedSweep->function_)( bi.get() ); @@ -81,11 +81,12 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors) // select and execute after functions for( size_t j=0; j < s.afterFuncs.size(); ++j ) - executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function"); + this->executeSelectable(s.afterFuncs[j].selectableFunc_, selectors, "Post-Sweep Function"); } } -void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) +template < typename TP > +void SweepTimeloop<TP>::doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &timing) { 
removeForDeletionMarkedSweeps(); // On first run we extract all possible names of sweeps, independent of selectors @@ -113,7 +114,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) //select and execute before functions for( size_t j=0; j < s.beforeFuncs.size(); ++j ) - executeSelectable( s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function", timing ); + this->executeSelectable( s.beforeFuncs[j].selectableFunc_, selectors, "Pre-Sweep Function", timing ); for( BlockStorage::iterator bi = blockStorage_.begin(); bi != blockStorage_.end(); ++bi ) { @@ -121,11 +122,11 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) if( s.sweep.empty() ) { WALBERLA_ABORT("Selecting Sweep " << sweepIt->first << ": " << - "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ); + "No sweep has been registered! Did you only register a BeforeFunction or AfterFunction?" ) } // ensure that exactly one sweep has been registered that matches the specified selectors - size_t numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); + size_t const numSweeps = s.sweep.getNumberOfMatching(selectors + bi->getState()); if (numSweeps == size_t(0)) { continue; @@ -139,7 +140,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) std::string sweepName; Sweep * selectedSweep = s.sweep.getUnique( selectors + bi->getState(), sweepName ); - WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ); + WALBERLA_LOG_PROGRESS("Running sweep \"" << sweepName << "\" on block " << bi->getId() ) // loop over all blocks timing[sweepName].start(); @@ -149,7 +150,7 @@ void SweepTimeloop::doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) // select and execute after functions for( size_t j=0; j < s.afterFuncs.size(); ++j ) - executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function", timing ); + 
this->executeSelectable(s.afterFuncs[j].selectableFunc_,selectors,"Post-Sweep Function", timing ); } } diff --git a/src/timeloop/Timeloop.h b/src/timeloop/Timeloop.h index 48b7de6ce88342e04c7aac2472f09fd6b566773f..faed83b06baf774c34247cd4ee6483510b3d4a78 100644 --- a/src/timeloop/Timeloop.h +++ b/src/timeloop/Timeloop.h @@ -48,6 +48,8 @@ using VoidFctNoArguments = std::function<void ()>; * \ingroup timeloop */ //******************************************************************************************************************* + +template < typename TP = timing::WcPolicy > class Timeloop : public ITimeloop { private: @@ -109,7 +111,10 @@ public: //**Construction & Destruction************************************************************************************ /*! \name Construction & Destruction */ //@{ - Timeloop( uint_t nrOfTimeSteps ); + Timeloop( uint_t nrOfTimeSteps ) + : curTimeStep_(0), nrOfTimeSteps_(nrOfTimeSteps), stop_( false ) + { + } ~Timeloop() override = default; //@} @@ -121,17 +126,17 @@ public: //@{ void run() override { run(true); } void run( const bool logTimeStep ); - void run( WcTimingPool & timing, const bool logTimeStep = true ); + void run( timing::TimingPool<TP> & timing, const bool logTimeStep = true ); void singleStep() override { singleStep(true); } void singleStep( const bool logTimeStep ); - void singleStep( WcTimingPool & timing, const bool logTimeStep = true ); + void singleStep( timing::TimingPool<TP> & timing, const bool logTimeStep = true ); void stop() override; void synchronizedStop( bool stop ) override; void setCurrentTimeStepToZero() { curTimeStep_ = 0; } - void setCurrentTimeStep( uint_t ts) override { curTimeStep_ = ts; } + void setCurrentTimeStep( uint_t ts ) override { curTimeStep_ = ts; } //@} //**************************************************************************************************************** @@ -183,7 +188,7 @@ public: protected: virtual void doTimeStep(const Set<SUID> &selectors) = 0; - virtual void 
doTimeStep(const Set<SUID> &selectors, WcTimingPool &timing) = 0; + virtual void doTimeStep(const Set<SUID> &selectors, timing::TimingPool<TP> &timing) = 0; void executeSelectable(const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, @@ -192,7 +197,7 @@ protected: void executeSelectable(const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, const Set<SUID> & selector, const std::string & what, - WcTimingPool & tp); + timing::TimingPool<TP> & tp); uint_t curTimeStep_; ///< current time step @@ -210,6 +215,8 @@ protected: } // namespace timeloop } // namespace walberla +#include "Timeloop.impl.h" + //====================================================================================================================== @@ -219,6 +226,7 @@ protected: //====================================================================================================================== namespace walberla { - using timeloop::Timeloop; + using Timeloop = typename timeloop::Timeloop < >; + using DeviceSynchronizeTimeloop = typename timeloop::Timeloop < timing::DeviceSynchronizePolicy >; } diff --git a/src/timeloop/Timeloop.cpp b/src/timeloop/Timeloop.impl.h similarity index 77% rename from src/timeloop/Timeloop.cpp rename to src/timeloop/Timeloop.impl.h index 6b2f548d54ec9922200488243eec2355e3a9f676..832f1c7adcdfdb8d9483508898e2b7e6b22a38fb 100644 --- a/src/timeloop/Timeloop.cpp +++ b/src/timeloop/Timeloop.impl.h @@ -29,18 +29,10 @@ namespace walberla { namespace timeloop { - -Timeloop::Timeloop( uint_t nrOfTimeSteps) - : curTimeStep_(0), nrOfTimeSteps_(nrOfTimeSteps), stop_( false ) -{ -} - - - - -void Timeloop::run( const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::run( const bool logTimeStep ) { - WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ); + WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ) while(curTimeStep_ < nrOfTimeSteps_) { singleStep( 
logTimeStep ); if ( stop_ ) { @@ -48,12 +40,13 @@ void Timeloop::run( const bool logTimeStep ) break; } } - WALBERLA_LOG_PROGRESS( "Timeloop finished" ); + WALBERLA_LOG_PROGRESS( "Timeloop finished" ) } -void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::run(timing::TimingPool<TP> & tp, const bool logTimeStep ) { - WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ); + WALBERLA_LOG_PROGRESS( "Running timeloop for " << nrOfTimeSteps_ << " time steps" ) while(curTimeStep_ < nrOfTimeSteps_) { singleStep( tp, logTimeStep ); @@ -63,7 +56,7 @@ void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) } } - WALBERLA_LOG_PROGRESS( "Timeloop finished" ); + WALBERLA_LOG_PROGRESS( "Timeloop finished" ) } //******************************************************************************************************************* @@ -74,7 +67,9 @@ void Timeloop::run( WcTimingPool & tp, const bool logTimeStep ) * before reaching nrOfTimeSteps */ //******************************************************************************************************************* -void Timeloop::stop() + +template < typename TP > +void Timeloop<TP>::stop() { stop_ = true; } @@ -89,17 +84,20 @@ void Timeloop::stop() * -> If at least on process calls synchronizedStop(true) the timeloop is stopped */ //******************************************************************************************************************* -void Timeloop::synchronizedStop( bool stopVal ) + +template < typename TP > +void Timeloop<TP>::synchronizedStop( bool stopVal ) { stop_ = stopVal; mpi::allReduceInplace( stop_, mpi::LOGICAL_OR ); } -void Timeloop::singleStep( const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::singleStep( const bool logTimeStep ) { - LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep ); + LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep ); - 
WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ); + WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ) for(size_t i=0; i<beforeFunctions_.size(); ++i ) executeSelectable( beforeFunctions_[i], uid::globalState(), "Pre-Timestep Function" ); @@ -112,11 +110,12 @@ void Timeloop::singleStep( const bool logTimeStep ) ++curTimeStep_; } -void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep ) +template < typename TP > +void Timeloop<TP>::singleStep( timing::TimingPool<TP> & tp, const bool logTimeStep ) { - LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep ); + LoggingStampManager const raii( make_shared<LoggingStamp>( *this ), logTimeStep ); - WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ); + WALBERLA_LOG_PROGRESS( "Running time step " << curTimeStep_ ) for(size_t i=0; i<beforeFunctions_.size(); ++i ) executeSelectable( beforeFunctions_[i], uid::globalState(), "Pre-Timestep Function", tp ); @@ -133,47 +132,47 @@ void Timeloop::singleStep( WcTimingPool & tp, const bool logTimeStep ) ////////////////////////////////////////// Registering Functions /////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -Timeloop::FctHandle -Timeloop::addFuncBeforeTimeStep(const VoidFctNoArguments& f, const std::string & id, +template < typename TP > +typename Timeloop<TP>::FctHandle +Timeloop<TP>::addFuncBeforeTimeStep(const VoidFctNoArguments& f, const std::string & id, const Set<SUID> & r, const Set<SUID> & e ) { beforeFunctions_.emplace_back(f,r,e,id ); return beforeFunctions_.size() - 1; } - -void Timeloop::addFuncBeforeTimeStep(const Timeloop::FctHandle & h, +template < typename TP > +void Timeloop<TP>::addFuncBeforeTimeStep(const Timeloop::FctHandle & h, const VoidFctNoArguments& f, const std::string & id, const Set<SUID>&r, const Set<SUID> & e ) { - WALBERLA_ASSERT_LESS( h, 
beforeFunctions_.size() ); //invalid FctHandle + WALBERLA_ASSERT_LESS( h, beforeFunctions_.size() ) //invalid FctHandle beforeFunctions_[h].add(f,r,e,id); } - -Timeloop::FctHandle -Timeloop::addFuncAfterTimeStep(const VoidFctNoArguments& f, const std::string & id, +template < typename TP > +typename Timeloop<TP>::FctHandle +Timeloop<TP>::addFuncAfterTimeStep(const VoidFctNoArguments& f, const std::string & id, const Set<SUID> & r, const Set<SUID> & e ) { afterFunctions_.emplace_back(f,r,e,id ); return afterFunctions_.size() - 1; } - -void Timeloop::addFuncAfterTimeStep(const Timeloop::FctHandle & h, +template < typename TP > +void Timeloop<TP>::addFuncAfterTimeStep(const Timeloop::FctHandle & h, const VoidFctNoArguments& f, const std::string & id, const Set<SUID>&r, const Set<SUID> & e ) { - WALBERLA_ASSERT_LESS( h, afterFunctions_.size() ); //invalid FctHandle + WALBERLA_ASSERT_LESS( h, afterFunctions_.size() ) //invalid FctHandle afterFunctions_[h].add(f,r,e,id); } - -void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, +template < typename TP > +void Timeloop<TP>::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, const Set<SUID> & selector, const std::string & what ) { @@ -182,20 +181,21 @@ void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctN if( exe == nullptr ) WALBERLA_ABORT( "Trying to selecting " << what << ": " << "Multiple Matches found! 
Check your selector " << selector << std::endl - << "All registered objects: " << std::endl << selectable << std::endl ); + << "All registered objects: " << std::endl << selectable << std::endl ) - WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ); + WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ) LIKWID_MARKER_START( objectName.c_str() ); (*exe)(); LIKWID_MARKER_STOP( objectName.c_str() ); } -void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, +template < typename TP > +void Timeloop<TP>::executeSelectable( const selectable::SetSelectableObject<VoidFctNoArguments,SUID> & selectable, const Set<SUID> & selector, const std::string & what, - WcTimingPool & timing ) + timing::TimingPool<TP> & timing ) { std::string objectName; const VoidFctNoArguments * exe = selectable.getUnique( selector, objectName ); @@ -203,9 +203,9 @@ void Timeloop::executeSelectable( const selectable::SetSelectableObject<VoidFctN if( !exe) WALBERLA_ABORT( "Trying to select " << what << ": " << "Multiple or no matches found! Check your selector " << selector << std::endl - << "All registered objects: " << std::endl << selectable << std::endl ); + << "All registered objects: " << std::endl << selectable << std::endl ) - WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ); + WALBERLA_LOG_PROGRESS("Running " << what << " \"" << objectName << "\"" ) timing[objectName].start(); LIKWID_MARKER_START( objectName.c_str() ); diff --git a/src/waLBerlaDefinitions.in.h b/src/waLBerlaDefinitions.in.h index a9622b1e620aa2a98a91b9d9563b9bf4b89c63a5..3fddd25444efca07d819a2ce553331c7b96841b7 100644 --- a/src/waLBerlaDefinitions.in.h +++ b/src/waLBerlaDefinitions.in.h @@ -13,17 +13,20 @@ // double or single precision #cmakedefine WALBERLA_DOUBLE_ACCURACY +// Experimental half precision support. 
+#cmakedefine WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT // Debugging options #cmakedefine WALBERLA_ENABLE_GUI +#cmakedefine WALBERLA_BUILD_WITH_FASTMATH // External libraries -#cmakedefine WALBERLA_BUILD_WITH_BOOST #cmakedefine WALBERLA_BUILD_WITH_MPI #cmakedefine WALBERLA_BUILD_WITH_OPENMP #cmakedefine WALBERLA_BUILD_WITH_METIS #cmakedefine WALBERLA_BUILD_WITH_PARMETIS +#cmakedefine WALBERLA_BUILD_WITH_LIKWID_MARKERS #cmakedefine WALBERLA_BUILD_WITH_PYTHON @@ -33,6 +36,8 @@ #cmakedefine WALBERLA_MESAPD_CONVEX_POLYHEDRON_AVAILABLE #cmakedefine WALBERLA_BUILD_WITH_CUDA +#cmakedefine WALBERLA_BUILD_WITH_HIP +#cmakedefine WALBERLA_BUILD_WITH_GPU_SUPPORT #cmakedefine WALBERLA_BUILD_WITH_CODEGEN diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b7032214d314cae434db0a84da4eace9bdf30c52..b16438de039b01b03a062b79d3589642491416b5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,7 +5,7 @@ include_directories( ${walberla_BINARY_DIR}/src ) # for generated headers add_subdirectory( blockforest ) add_subdirectory( boundary ) add_subdirectory( core ) -add_subdirectory( cuda ) +add_subdirectory( gpu ) add_subdirectory( domain_decomposition ) add_subdirectory( executiontree ) add_subdirectory( fft ) @@ -14,6 +14,7 @@ add_subdirectory( gather ) add_subdirectory( geometry ) add_subdirectory( gui ) add_subdirectory( lbm ) +add_subdirectory( lbm_generated ) add_subdirectory( lbm_mesapd_coupling ) add_subdirectory( mesa_pd ) add_subdirectory( mesh ) diff --git a/tests/blockforest/BlockDataIOTest.cpp b/tests/blockforest/BlockDataIOTest.cpp index 90c4360966c4ac7c31595d6eca1f079e066a8360..b72a9dded44f10062ab290fe758dd93199df53c3 100644 --- a/tests/blockforest/BlockDataIOTest.cpp +++ b/tests/blockforest/BlockDataIOTest.cpp @@ -79,9 +79,9 @@ void test() blockforest::BlockForestEvaluation evaluation(sbf->getBlockForest()); WALBERLA_LOG_INFO_ON_ROOT("BlockForest:\n" << evaluation.toString()) - // auto originalFieldId = field::addToStorage< FieldType >( sbf, 
"OriginalField", 0.0, field::zyxf, uint_t(3), false, + // auto originalFieldId = field::addToStorage< FieldType >( sbf, "OriginalField", 0.0, field::fzyx, uint_t(3), false, // None, Empty ); - auto dataHandling = make_shared< field::DefaultBlockDataHandling< FieldType > >(sbf, uint_t(3), 0.0, field::zyxf); + auto dataHandling = make_shared< field::DefaultBlockDataHandling< FieldType > >(sbf, uint_t(3), 0.0, field::fzyx); auto originalFieldId = sbf->addBlockData(dataHandling, "OriginalField", None, Empty); math::seedRandomGenerator(numeric_cast< std::mt19937::result_type >(MPIManager::instance()->rank())); diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt index 70e14368e1d854a4ed7189cfc394ad7838361a4f..46b98eb48c90884feda14e4dde15341b4b5f3687 100644 --- a/tests/core/CMakeLists.txt +++ b/tests/core/CMakeLists.txt @@ -21,11 +21,6 @@ waLBerla_execute_test( NAME CellIntervalTest ) waLBerla_compile_test( FILES config/ConfigTest.cpp ) waLBerla_execute_test( NAME ConfigTest COMMAND $<TARGET_FILE:ConfigTest> ${CMAKE_CURRENT_SOURCE_DIR}/config/ConfigTest.dat ) -if( WALBERLA_BUILD_WITH_BOOST ) - waLBerla_compile_test( FILES config/MultiArrayIOTest.cpp ) - waLBerla_execute_test( NAME MultiArrayIOTest COMMAND $<TARGET_FILE:MultiArrayIOTest> ) -endif( WALBERLA_BUILD_WITH_BOOST ) - ######### # debug # ######### @@ -81,26 +76,10 @@ waLBerla_execute_test( NAME Matrix3Test ) waLBerla_compile_test( FILES math/GenericAABBTest.cpp DEPENDS stencil domain_decomposition ) waLBerla_execute_test( NAME GenericAABBTest ) -if( WALBERLA_BUILD_WITH_BOOST ) - waLBerla_compile_test( FILES math/PhysicalCheckTest.cpp DEPENDS stencil ) - waLBerla_execute_test( NAME PhysicalCheckTest - COMMAND $<TARGET_FILE:PhysicalCheckTest> ${CMAKE_CURRENT_SOURCE_DIR}/math/PhysicalCheckTestInput.prm - DEPENDS_ON_TARGETS PhysicalCheckTest ) -endif( WALBERLA_BUILD_WITH_BOOST ) - waLBerla_compile_test( FILES math/FastInvSqrtTest.cpp ) waLBerla_execute_test( NAME FastInvSqrtTest ) 
-######################## -# math/equation_system # -######################## - -if( WALBERLA_BUILD_WITH_BOOST ) - waLBerla_compile_test( FILES math/equation_system/EquationSolverTest.cpp ) - waLBerla_execute_test( NAME EquationSolverTest ) -endif( WALBERLA_BUILD_WITH_BOOST ) - ####### # mpi # ####### @@ -193,6 +172,9 @@ waLBerla_compile_test( FILES DebugSTLTest.cpp ) waLBerla_execute_test( NAME DebugSTLTest ) set_tests_properties(DebugSTLTest PROPERTIES WILL_FAIL TRUE) +waLBerla_compile_test( FILES FP16Test.cpp ) +waLBerla_execute_test( NAME FP16Test ) + waLBerla_compile_test( FILES FunctionTraitsTest.cpp ) waLBerla_execute_test( NAME FunctionTraitsTest ) diff --git a/tests/core/FP16Test.cpp b/tests/core/FP16Test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60a2be0eeee0872449f6a648fa1c65abbbda7f42 --- /dev/null +++ b/tests/core/FP16Test.cpp @@ -0,0 +1,85 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FP16Test.cpp +//! \ingroup core +//! 
\author Nils Kohl <nils.kohl@fau.de> +// +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/debug/Debug.h" +#include "core/debug/TestSubsystem.h" +#include "core/logging/Logging.h" +#include "core/Environment.h" + +#include <cstdlib> +#include <iostream> + +namespace walberla { + +void fp16Test( int argc, char ** argv ) +{ + Environment const env( argc, argv ); + + WALBERLA_LOG_INFO_ON_ROOT("-------------") + WALBERLA_LOG_INFO_ON_ROOT(" FP16 checks ") + WALBERLA_LOG_INFO_ON_ROOT("-------------") + +#ifndef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + WALBERLA_LOG_INFO_ON_ROOT(" - Test does nothing as it was not built with fp16 support.") + WALBERLA_LOG_INFO_ON_ROOT(" - Apparently you have not enabled half precision support.") + WALBERLA_LOG_INFO_ON_ROOT(" - Reconfigure by setting the respective CMake variable " + "(at the time of writing this it's called WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT) " + "to ON.") +#else + + WALBERLA_LOG_INFO_ON_ROOT(" - Half precision support enabled via CMake!") + + WALBERLA_LOG_INFO_ON_ROOT(" - Sizeof checks: ") + auto sfloat64 = sizeof(float64); + auto sfloat32 = sizeof(float32); + auto sfloat16 = sizeof(float16); + WALBERLA_CHECK_EQUAL( sfloat64, 8, "Your types don't seem to have the expected sizes." ); + WALBERLA_CHECK_EQUAL( sfloat32, 4, "Your types don't seem to have the expected sizes." ); + WALBERLA_CHECK_EQUAL( sfloat16, 2, "Your types don't seem to have the expected sizes." 
); + + WALBERLA_LOG_INFO_ON_ROOT(" - Casting checks (promotion is required to format strings): ") + const float64 a64 = 42; + const float32 a32 = 42; + const float16 a16 = 42; + WALBERLA_LOG_INFO_ON_ROOT(" + float64: " << a64) + WALBERLA_LOG_INFO_ON_ROOT(" + float32: " << a32) + WALBERLA_LOG_INFO_ON_ROOT(" + float16: " << (double) a16) + WALBERLA_LOG_INFO_ON_ROOT(" Casting and output compiles.") + + WALBERLA_LOG_INFO_ON_ROOT(" - Basic arithmetic check: ") + const float16 x = 1.2f16; + const float16 y = -1.8f16; + const float64 z = -0.6; + WALBERLA_LOG_INFO_ON_ROOT(" + " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << " ? ") + WALBERLA_CHECK_FLOAT_EQUAL((float64) (x + y), z, "float16 addition does not work correctly."); +#endif +} + +} + + +int main( int argc, char** argv ) +{ + walberla::debug::enterTestMode(); + walberla::fp16Test( argc, argv ); + return EXIT_SUCCESS; +} diff --git a/tests/core/FunctionTraitsTest.cpp b/tests/core/FunctionTraitsTest.cpp index 8c378eceaa7f16bf08a8400f31a35ffa028f6a9d..dc503f2db3df10f911fe2df14d7accbe0a800524 100644 --- a/tests/core/FunctionTraitsTest.cpp +++ b/tests/core/FunctionTraitsTest.cpp @@ -25,7 +25,7 @@ using namespace walberla; -// FunctionTraits are used in a similar way in cuda/Kernel.h. As explained below, special attention is required. +// FunctionTraits are used in a similar way in gpu/Kernel.h. As explained below, special attention is required. template< typename F> struct SomeClass { diff --git a/tests/core/config/MultiArrayIOTest.cpp b/tests/core/config/MultiArrayIOTest.cpp deleted file mode 100644 index 515b1071b12a12f8ea93838d44253d6a3c557e89..0000000000000000000000000000000000000000 --- a/tests/core/config/MultiArrayIOTest.cpp +++ /dev/null @@ -1,61 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. 
waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file MultiArrayIOTest.h -//! \author Martin Bauer <martin.bauer@fau.de> -// -//====================================================================================================================== - - -#include "core/MultiArrayIO.h" -#include "core/debug/TestSubsystem.h" -#include "core/Environment.h" - -#include <sstream> -#include <iostream> - -using namespace walberla; - - -int main( int argc, char ** argv ) -{ - debug::enterTestMode(); - walberla::Environment walberlaEnv( argc, argv ); - - using namespace std; - - string test1 = "[[ 0.2 0.24 0.2 0.17]\n[ 0.24 0.2 0.2 0.2 ]\n\t[ 0.2 0.2 0.2 0.2 ]\n[ 0.17 0.2 0.2 0.2 ]]"; - stringstream ss1 ( test1 ); - //string betweenBrackets; - //WALBERLA_CHECK( readContentBetweenBrackets( ss1, betweenBrackets ) ); - //cout << betweenBrackets; - - boost::multi_array<real_t,2> arr1; - bool res = !( ss1 >> arr1 ).fail(); - WALBERLA_CHECK( res ); - cout << arr1 << endl; - - - string test2 = " [1 2 3,4,5,6,7 ]"; - stringstream ss2 ( test2 ); - - boost::multi_array<real_t,1> arr2; - bool res2 = !(ss2 >> arr2).fail() ; - WALBERLA_CHECK( res2 ); - cout << arr2 << endl; - - - return 0; -} \ No newline at end of file diff --git a/tests/core/load_balancing/MetisTest.cpp b/tests/core/load_balancing/MetisTest.cpp index 
98d13eeb68f65c016a321a084b4741fecdc1065d..d8f113f929268ce2239943cca44e6b33aada0d3d 100644 --- a/tests/core/load_balancing/MetisTest.cpp +++ b/tests/core/load_balancing/MetisTest.cpp @@ -77,8 +77,8 @@ int main( int argc, char * argv[] ) typedef field::GhostLayerField< int64_t, 1 > FieldType; - auto domainId = field::addToStorage< FieldType >( blocks, "domain", int64_t(-1), field::zyxf, uint_t(1) ); - auto partFieldId = field::addToStorage< FieldType >( blocks, "partitions", int64_t(-1), field::zyxf, uint_t(1) ); + auto domainId = field::addToStorage< FieldType >( blocks, "domain", int64_t(-1), field::fzyx, uint_t(1) ); + auto partFieldId = field::addToStorage< FieldType >( blocks, "partitions", int64_t(-1), field::fzyx, uint_t(1) ); auto & domain = *( blocks->begin()->getData< FieldType >( domainId ) ); auto & partField = *( blocks->begin()->getData< FieldType >( partFieldId ) ); diff --git a/tests/core/load_balancing/ParMetisTest.cpp b/tests/core/load_balancing/ParMetisTest.cpp index a02634c839cf77de7190ca2794696e7863f9bf59..b0d9f578e281e3d17ee0c4e6308664ec8013822d 100644 --- a/tests/core/load_balancing/ParMetisTest.cpp +++ b/tests/core/load_balancing/ParMetisTest.cpp @@ -82,8 +82,8 @@ int main( int argc, char * argv[] ) typedef field::GhostLayerField< int64_t, 1 > FieldType; - auto domainId = field::addToStorage< FieldType >( blocks, "domain", int64_t(-1), field::zyxf, uint_t(1) ); - auto partFieldId = field::addToStorage< FieldType >( blocks, "partitions", int64_t(-1), field::zyxf, uint_t(1) ); + auto domainId = field::addToStorage< FieldType >( blocks, "domain", int64_t(-1), field::fzyx, uint_t(1) ); + auto partFieldId = field::addToStorage< FieldType >( blocks, "partitions", int64_t(-1), field::fzyx, uint_t(1) ); auto & domain = *( blocks->begin()->getData< FieldType >( domainId ) ); auto & partField = *( blocks->begin()->getData< FieldType >( partFieldId ) ); diff --git a/tests/core/math/PhysicalCheckTest.cpp b/tests/core/math/PhysicalCheckTest.cpp deleted 
file mode 100644 index d92517723561bff7dfe590450ad8d771c2531136..0000000000000000000000000000000000000000 --- a/tests/core/math/PhysicalCheckTest.cpp +++ /dev/null @@ -1,102 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file PhysicalCheckTest.cpp -//! \ingroup core -//! \author David Staubach <david.staubach@fau.de> -// -//====================================================================================================================== - -#include "core/Abort.h" -#include "core/DataTypes.h" -#include "core/Environment.h" -#include "core/debug/TestSubsystem.h" -#include "core/logging/Logging.h" -#include "core/math/PhysicalCheck.h" - -#include <iostream> -#include <string> -#include <vector> - - -using namespace walberla; -using namespace walberla::math; - - -int testPhysicalCheck1( shared_ptr<Config> & config ) -{ - Config::BlockHandle pcConfigBlock = config->getBlock( "Physical_Check" ); - - if( !pcConfigBlock ) - WALBERLA_ABORT( "You have to specify a \"Physical_Check\" block in the configuration file!" 
); - - PhysicalCheck pc( pcConfigBlock ); - - WALBERLA_CHECK( pc.isDefined(std::string("parameter1")) ); - WALBERLA_CHECK( pc.isDefined(std::string("parameter2")) ); - WALBERLA_CHECK( pc.isDefined(std::string("var1")) ); - WALBERLA_CHECK( pc.isDefined(std::string("var2")) ); - // TODO: further checks - - // Check for functionality is within the function - pc.completeConfig( config ); - - return 0; -} - -int testPhysicalCheck2( shared_ptr<Config> & config ) -{ - Config::BlockHandle pcConfigBlock = config->getBlock( "Physical_Check" ); - - if( !pcConfigBlock ) - WALBERLA_ABORT( "You have to specify a \"Physical_Check\" block in the configuration file!" ); - - PhysicalCheck pc; - - pc.addBlock(pcConfigBlock); - - WALBERLA_CHECK( pc.isDefined(std::string("parameter1")) ); - WALBERLA_CHECK( pc.isDefined(std::string("parameter2")) ); - WALBERLA_CHECK( pc.isDefined(std::string("var1")) ); - WALBERLA_CHECK( pc.isDefined(std::string("var2")) ); - // TODO: further checks - - // Check for functionality is within the function - pc.completeConfig( config ); - - return 0; -} - -int main( int argc, char** argv ) -{ - walberla::Environment env( argc, argv ); - - debug::enterTestMode(); - - try { - shared_ptr<Config> config = env.config(); - - int value; - value = testPhysicalCheck1( config ); - - value = testPhysicalCheck2( config ); - - return value; - } - catch( std::exception & e ) - { - WALBERLA_LOG_INFO( "Unhandled exception raised: " << e.what() ); - } -} diff --git a/tests/core/math/PhysicalCheckTestInput.prm b/tests/core/math/PhysicalCheckTestInput.prm deleted file mode 100644 index d93526bcedab10fdacfd65d948aa4e6961b65b56..0000000000000000000000000000000000000000 --- a/tests/core/math/PhysicalCheckTestInput.prm +++ /dev/null @@ -1,48 +0,0 @@ -//GlobalState FZYX,SRTD3Q19LevelBased; -//SRTD3Q19LevelBased, MRTD3Q19OriginalLevelBased, MRTD3Q19ExtendedLevelBased - -Physical_Check { - - Equations { - eq0 parameter1 = 23; - eq1 parameter2 = 42; - eq2 parameter3 = -11; - - eq3 
var1 = parameter1 + sqrt(parameter2); - eq4 var2 = parameter2 - ln(23); - eq5 var3 = parameter3 * exp(var2); - - eq6 dx = 2; - eq7 dt = 1; - eq8 rho = 3; - } - - Units { - parameter1 m; - parameter2 m^2/s; - parameter3 1/Akg; - } - - Constraints { - co0 var1 > 20; - co2 var3 <= -209; - co3 parameter1 >= 23; - co5 parameter2 < parameter1*2; - } -} - -Geometry { - BoundaryConditionXYZ { - velocity 'parameter1'; - pressure 'var1 * 29.9'; - - BoundaryConditionZYX { - pressureIN 'parameter3'; - pressureOUT 'var3-var2'; - } - } - - BoundaryConditionABC { - velocity 'parameter1'; - } -} diff --git a/tests/core/math/equation_system/EquationSolverTest.cpp b/tests/core/math/equation_system/EquationSolverTest.cpp deleted file mode 100644 index e711d04f8300e746ce14d922d7103c3afe3b0670..0000000000000000000000000000000000000000 --- a/tests/core/math/equation_system/EquationSolverTest.cpp +++ /dev/null @@ -1,203 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file EquationSolverTest.cpp -//! \ingroup core -//! 
\author Matthias Markl <matthias.markl@fau.de> -// -//====================================================================================================================== - -#include "core/DataTypes.h" -#include "core/debug/TestSubsystem.h" -#include "core/logging/Logging.h" -#include "core/mpi/Environment.h" -#include "core/math/equation_system/EquationParser.h" -#include "core/math/equation_system/EquationSystem.h" - -#include <iostream> -#include <string> -#include <vector> - - -using namespace walberla; -using namespace walberla::math; - -/* -int directInput(){ - EquationSystem es; - EquationParser ep(es); - - std::string str; - uint_t index = 0; - - std::cout << "\nWrite equations to solve without any blank!\nTo quit enter 'exit'\nTo clear all known variable enter 'clear'"<< std::endl; - - bool run = true; - do { - std::cin >> str; - index = 0; - if ( strcmp(str.c_str(),"exit") == 0 ) - return 0; - if ( strcmp(str.c_str(),"clear") == 0 ){ - std::cout << "Clear known variables and equations" << std::endl; - es.clear(); - continue; - } - try { - EquationPtr eqPtr = ep.parseEquation(str, index); - es.add(str, eqPtr); - std::cout << "Convert '" << str << "' to '" << (*eqPtr) << "'" << std::endl; - if (eqPtr->isComputable()){ - VarPtr varPtr = eqPtr->compute(); - std::cout << "Equation is computable: " << varPtr->name() << "=" << varPtr->value() << std::endl; - } else if (eqPtr->isEvaluatable()){ - std::cout << "Equation is evaluatable: " << (eqPtr->evaluate() ? "true" : "false") << std::endl; - } else { - std::cout << "Equation is neither computable nor evaluatable!" 
<< std::endl; - } - } catch (std::runtime_error re) { - std::cerr << re.what() << std::endl; - } - } while (run); - return 0; -} - */ - - -int equationInput(){ - std::vector<std::string> eqStringList; - - //// Parameters - eqStringList.emplace_back("dt = 2e-7"); - eqStringList.emplace_back("dx = 5e-6"); - eqStringList.emplace_back("eta = 0.0001"); - eqStringList.emplace_back("omega = 1.95"); - eqStringList.emplace_back("rho = 1000"); - - //// LBM Equations - eqStringList.emplace_back("'rho_L' = 1.0"); - eqStringList.emplace_back("'dt_L' = 1.0"); - eqStringList.emplace_back("'dx_L' = 1.0"); - eqStringList.emplace_back("'c' = 'dx_L' / 'dt_L'"); - eqStringList.emplace_back("'nu' = 'eta' / 'rho'"); - eqStringList.emplace_back("'nu_L' = 'eta_L' / 'rho_L'"); - eqStringList.emplace_back("'dt' = ( 0.1 * 'dx' ) / 'maxOcurringPhysVel'"); - eqStringList.emplace_back("'cs' = ( 1.0 / ( 3.0 ^ 0.5 ) ) * 'c'"); - eqStringList.emplace_back("'omega' = 1.0 / 'tau'"); - eqStringList.emplace_back("'nu_L' = ( 'cs' ^ 2.0 ) * ( 'tau' - ( 0.5 * 'dt_L' ) )"); - /* - // Unsolvable: - // Parameters - eqStringList.push_back( "nu = 3.50E-006"); - eqStringList.push_back( "omega = 1.99"); - eqStringList.push_back( "rho = 1000"); - eqStringList.push_back( "maxOcurringPhysVel = 0.10"); - - - // LBM Equations - eqStringList.push_back( "'rho_L' = 1.0"); - eqStringList.push_back( "'dt_L' = 1.0"); - eqStringList.push_back( "'dx_L' = 1.0"); - eqStringList.push_back( "'c' = 'dx_L' / 'dt_L'"); - eqStringList.push_back( "'nu' = 'eta' / 'rho'"); - eqStringList.push_back( "'nu_L' = 'eta_L' / 'rho_L'"); - eqStringList.push_back( "'dt' = ( 0.1 * 'dx' ) / 'maxOcurringPhysVel'"); - eqStringList.push_back( "'cs' = ( 1.0 / ( 3.0 ^ 0.5 ) ) * 'c'"); - eqStringList.push_back( "'omega' = 1.0 / 'tau'"); - eqStringList.push_back( "'nu_L' = ( 'cs' ^ 2.0 ) * ( 'tau' - ( 0.5 * 'dt_L' ) )"); - eqStringList.push_back( "'nu_L' = (nu * dt) / dx^2"); - */ - - EquationSystem es; - EquationParser ep(es); - size_t index = 0; - 
size_t number = 0; - - for (size_t i=0; i<eqStringList.size(); ++i){ - index = 0; - es.add( std::to_string(++number), ep.parseEquation( eqStringList[i], index ) ); - } - - WALBERLA_CHECK( es.solve() ); - //es.match(); - WALBERLA_LOG_RESULT( es ); - return 0; -} - -int unitTest(double v) -{ - std::string s = std::to_string( v ); - - std::vector<std::string> eqStringList; - eqStringList.push_back( "a = " + s ); - eqStringList.push_back( "a + 3 = " + s + " + 3" ); - eqStringList.push_back( "3 + a = 3 + " + s + "" ); - eqStringList.push_back( "a - 3 = " + s + " - 3" ); - eqStringList.push_back( "3 - a = 3 - " + s + "" ); - eqStringList.push_back( "a * 3 = " + s + " * 3" ); - eqStringList.push_back( "3 * a = 3 * " + s + "" ); - eqStringList.push_back( "a / 3 = " + s + " / 3" ); - eqStringList.push_back( "3 / a = 3 / " + s + "" ); - eqStringList.push_back( "a ^ 3 = " + s + " ^ 3" ); - eqStringList.push_back( "3 ^ a = 3 ^ " + s + "" ); - eqStringList.push_back( "sqrt(a) = sqrt(" + s + ")" ); - eqStringList.push_back( "exp(a) = exp(" + s + ")" ); - eqStringList.push_back( "ln(a) = ln(" + s + ")" ); - - EquationSystem es; - EquationParser ep(es); - uint_t index = 0; - - for (uint_t i=0; i<eqStringList.size(); ++i){ - index = 0; - es.add( eqStringList[i], ep.parseEquation( eqStringList[i], index ) ); - } - WALBERLA_CHECK( es.solve() ); - - return 0; -} - -int unitTests(unsigned int count){ - srand( static_cast<unsigned int>(time(nullptr)) ); - - double values[] = {0.0, 1.0, 1e-15, 1e+15}; - unsigned int size = 4; - - int test = 0; - for (unsigned int i=0; i<size && test == 0; ++i){ - test = unitTest( values[i] ); - } - - for (unsigned int i=0; i<count && test == 0; ++i){ - double value = double(rand()) / RAND_MAX; - int exp = rand() / ( RAND_MAX / 30 ) - 14; - test = unitTest( pow( value, exp ) ); - } - - return test; -} - -int main( int argc, char** argv ) -{ - debug::enterTestMode(); - - mpi::Environment mpiEnv( argc, argv ); - - int value; - //value = unitTests(100); - 
value = equationInput(); - //value = directInput(); - return value; -} diff --git a/tests/cuda/communication/CommTest.cpp b/tests/cuda/communication/CommTest.cpp deleted file mode 100644 index 8233ac5615a106413516627720ee563e98b4fe0a..0000000000000000000000000000000000000000 --- a/tests/cuda/communication/CommTest.cpp +++ /dev/null @@ -1,244 +0,0 @@ -//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file -//! 
\author Paulo Carvalho <prcjunior@inf.ufpr.br> -// -//====================================================================================================================== - - -#include "core/debug/TestSubsystem.h" -#include "core/Environment.h" -#include "core/mpi/Datatype.h" - -#include "field/communication/MPIDatatypes.h" -#include "field/Field.h" - -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" - -#define NUM_ITER 100 -#define SIZE_X 16 -#define SIZE_Y 16 -#define SIZE_Z 16 -#define LAYOUT field::fzyx - - -using namespace walberla; - -void hostToHost() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0, LAYOUT); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - hostField2.set(hostField1); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void hostToDevice() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - cuda::fieldCpy(deviceField, hostField); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void deviceToHost() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - cuda::fieldCpy(deviceField, hostField); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - cuda::fieldCpy(hostField, deviceField); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiHostToHost() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); - - auto hostDatatype1 = mpi::Datatype ( 
field::communication::mpiDatatype( hostField1 ) ); - auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiHostToDevice() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype = mpi::Datatype ( field::communication::mpiDatatype( hostField ) ); - auto deviceDatatype = mpi::Datatype ( field::communication::mpiDatatype( deviceField ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiDeviceToHost() -{ - Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype = mpi::Datatype ( field::communication::mpiDatatype( hostField ) ); - auto deviceDatatype = mpi::Datatype ( field::communication::mpiDatatype( deviceField ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( deviceField.data(), 1, deviceDatatype, 
0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiDeviceToDevice() -{ - cuda::GPUField<double> deviceField1(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - cuda::GPUField<double> deviceField2(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto deviceDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( deviceField1 ) ); - auto deviceDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( deviceField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( deviceField1.data(), 1, deviceDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( deviceField2.data(), 1, deviceDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiCopyHostToDevice() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); - auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - MPI_Request request1; - MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Request request2; - MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - 
MPI_Wait( &request2, MPI_STATUS_IGNORE ); - - cuda::fieldCpy(deviceField, hostField2); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -void mpiCopyDeviceToHost() -{ - Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); - Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); - cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); - - auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); - auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); - - double startTime = MPI_Wtime(); - for (int i = 0; i < NUM_ITER; ++i) { - - MPI_Request request2; - MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); - - cuda::fieldCpy(hostField1, deviceField); - - MPI_Request request1; - MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); - - MPI_Wait( &request1, MPI_STATUS_IGNORE ); - MPI_Wait( &request2, MPI_STATUS_IGNORE ); - } - double endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; -} - -int main( int argc, char ** argv ) -{ - debug::enterTestMode(); - walberla::Environment walberlaEnv( argc, argv ); - - WALBERLA_CHECK_EQUAL(MPIManager::instance()->numProcesses(), 2); - - hostToHost(); - hostToDevice(); - deviceToHost(); - mpiHostToHost(); - mpiHostToDevice(); - mpiDeviceToHost(); - mpiDeviceToDevice(); - mpiCopyHostToDevice(); - mpiCopyDeviceToHost(); - - return 0; -} diff --git a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp deleted file mode 100644 index 27fdba33bb1c0d921cdd989a7f3879d4580ce4b0..0000000000000000000000000000000000000000 --- a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp +++ /dev/null @@ -1,177 +0,0 @@ 
-//======================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file GPUFieldPackInfoTest.cpp -//! \ingroup cuda -//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> -//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple streams. 
-// -//======================================================================================================================== - -#include "core/DataTypes.h" -#include "core/debug/TestSubsystem.h" -#include "core/math/Random.h" -#include "core/mpi/Environment.h" - -#include "stencil/Directions.h" -#include "stencil/Iterator.h" -#include "stencil/D3Q27.h" - -#include "domain_decomposition/BlockDataID.h" - -#include "blockforest/Initialization.h" -#include "blockforest/communication/UniformBufferedScheme.h" - -#include "field/GhostLayerField.h" - -#include "cuda/ErrorChecking.h" -#include "cuda/HostFieldAllocator.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/communication/GPUPackInfo.h" - -#include <cuda_runtime.h> -#include <vector> - -using namespace walberla; - -using DataType = walberla::uint_t; -using StencilType = stencil::D3Q27; -using FieldType = field::GhostLayerField< DataType, StencilType::Size >; -using GPUFieldType = cuda::GPUField< DataType >; -using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>; -using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >; - -static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::zyxf }; -static uint_t fieldLayoutIndex = 0; - - -FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new FieldType( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - DataType(0), // initial value - fieldLayouts[fieldLayoutIndex], // layout - make_shared<cuda::HostFieldAllocator< DataType > >() // allocator for host pinned memory - ); -} - - -GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new GPUFieldType( - 
storage->getNumberOfXCells( *block ), // number of cells in x direction - storage->getNumberOfYCells( *block ), // number of cells in y direction - storage->getNumberOfZCells( *block ), // number of cells in z direction - StencilType::Size, // number of cells for pdfs - 1, // one ghost layer - fieldLayouts[fieldLayoutIndex] ); -} - - -void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID ) -{ - for( auto block = blocks->begin(); block != blocks->end(); ++block ) - { - auto fieldPtr = block->getData< FieldType >( fieldID ); - - for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt ) - *fieldIt = math::intRandom< DataType >(); - } -} - - -int main( int argc, char ** argv ) -{ - debug::enterTestMode(); - mpi::Environment mpiEnv( argc, argv ); - - - const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 ); - - uint_t nProc = uint_c( MPIManager::instance()->numProcesses() ); - - for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex ) - { - auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks - cells[0], cells[1], cells[2], // cells - 1, // unit cell spacing - true, // one block per process - true, true, true); // periodic in all directions - - BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField, - "ScalarField" ); - - BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField, - "syncGPUField" ); - - BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField, - "asyncGPUField" ); - - math::seedRandomGenerator( numeric_cast<std::mt19937::result_type>( MPIManager::instance()->rank() ) ); - // Initialize CPU field with random values - initFields( blocks, sourceFieldId ); - - // Copy same CPU field to both GPU fields - for( auto block = blocks->begin(); block != blocks->end(); ++block ) - { - auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId ); - - auto 
syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId ); - cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr ); - - auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId ); - cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr ); - } - - // Setup communication schemes for synchronous GPUPackInfo - CommSchemeType syncCommScheme(blocks); - syncCommScheme.addPackInfo( make_shared< GPUPackInfoType >( syncGPUFieldId ) ); - - // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams - CommSchemeType asyncCommScheme(blocks); - asyncCommScheme.addPackInfo( make_shared< GPUPackInfoType >( asyncGPUFieldId ) ); - - // Perform one communication step for each scheme - syncCommScheme(); - asyncCommScheme(); - - // Check results - FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], - make_shared< cuda::HostFieldAllocator< DataType > >() ); - FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], - make_shared< cuda::HostFieldAllocator< DataType > >() ); - - for( auto block = blocks->begin(); block != blocks->end(); ++block ) - { - auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId ); - cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr ); - - auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId ); - cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr ); - - for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ(); - syncIt != syncFieldCpu.end(); - ++syncIt, ++asyncIt ) - WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt ); - } - } - - - return EXIT_SUCCESS; -} diff --git a/tests/cuda/communication/GPUPackInfoTest.cpp b/tests/cuda/communication/GPUPackInfoTest.cpp deleted file mode 100644 index 0cafd76f5178022de70a3cf3d96e0fc2f139e7b5..0000000000000000000000000000000000000000 --- a/tests/cuda/communication/GPUPackInfoTest.cpp +++ /dev/null @@ -1,186 +0,0 @@ 
-//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file GPUFieldPackInfoTest.cpp -//! \ingroup cuda -//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> -//! \brief Tests if a GPUField is correctly packed into buffers -// -//====================================================================================================================== - -#include "field/GhostLayerField.h" - -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/communication/GPUPackInfo.h" - -#include "blockforest/Initialization.h" - -#include "core/debug/TestSubsystem.h" -#include "core/mpi/MPIManager.h" - -#include "stencil/D3Q27.h" - -#include <cstring> -#include <vector> -#include <cuda_runtime.h> - -#define F_SIZE 19 - -using namespace walberla; - -static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf }; -static uint_t fieldLayoutIndex = 0; - -cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) -{ - return new cuda::GPUField<int> ( - storage->getNumberOfXCells( *block ), // number of cells in x direction - storage->getNumberOfYCells( *block ), // number of cells in y direction - storage->getNumberOfZCells( *block ), // number of cells in 
z direction - F_SIZE, // fSize - 1, // number of ghost layers - fieldLayouts[fieldLayoutIndex] ); -} - -// Tester base class. The communicate() template method allows testing different communication methods. -class GPUPackInfoTester -{ -public: - - typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType; - - GPUPackInfoTester( IBlock* block, BlockDataID fieldId ) : - block_( block ), fieldId_( fieldId ) {} - - virtual ~GPUPackInfoTester() {} - - void test( stencil::Direction dir ) - { - cuda::GPUField<int> & gpuField = *(block_->getData<cuda::GPUField<int> >( fieldId_ )); - - field::GhostLayerField<int,F_SIZE> cpuField( - gpuField.xSize(), // number of cells in x direction - gpuField.ySize(), // number of cells in y direction - gpuField.zSize(), // number of cells in z direction - 1, // number of ghost layers - 0, // initial value - fieldLayouts[fieldLayoutIndex]); - cpuField.setWithGhostLayer( 0 ); - - int val = 0; - for ( auto it = cpuField.beginSliceBeforeGhostLayer( dir ); it != cpuField.end(); ++it ) - { - *it = ++val; - } - cuda::fieldCpy( gpuField, cpuField ); - - GPUPackInfoType gpuPackInfo( fieldId_ ); - - communicate( gpuPackInfo, dir ); - cuda::fieldCpy( cpuField, gpuField ); - - val = 0; - for ( auto it = cpuField.beginGhostLayerOnly( stencil::inverseDir[dir] ); it != cpuField.end(); ++it ) - { - WALBERLA_CHECK_EQUAL( *it, ++val ); - } - - } - -protected: - - virtual void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) = 0; - - IBlock* block_; - BlockDataID fieldId_; -}; - - -// Tester for buffer communication -class GPUPackInfoBufferTester: public GPUPackInfoTester -{ -public: - GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId): GPUPackInfoTester( block, fieldId ) {} - -protected: - void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) - { - mpi::GenericSendBuffer<> sendBuf; - sendBuf.addDebugMarker( "Be" ); - gpuPackInfo.packData( block_, dir, sendBuf ); - 
sendBuf.addDebugMarker( "Af" ); - - // Manually copy over the send to the receive buffer - mpi::GenericRecvBuffer<> recvBuf; - recvBuf.resize( sendBuf.size() ); - memcpy( recvBuf.ptr(), sendBuf.ptr(), sendBuf.size() * sizeof(mpi::GenericSendBuffer<>::ElementType) ); - - recvBuf.readDebugMarker( "Be" ); - gpuPackInfo.unpackData( block_, stencil::inverseDir[dir], recvBuf ); - recvBuf.readDebugMarker( "Af" ); - } -}; - - -// Tester for local communication -class GPUPackInfoLocalTester: public GPUPackInfoTester -{ -public: - GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {} - -protected: - void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) - { - gpuPackInfo.communicateLocal( block_, block_, dir ); - } -}; - - -int main(int argc, char **argv) -{ - using blockforest::createUniformBlockGrid; - - debug::enterTestMode(); - MPIManager::instance()->initializeMPI(&argc,&argv); - - for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex ) - { - // Create BlockForest - uint_t processes = uint_c( MPIManager::instance()->numProcesses() ); - auto blocks = createUniformBlockGrid(processes,1,1, //blocks - 2,2,2, //cells - 1, //dx - false, //one block per process - true,true,true);//periodicity - - BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >( - &createGPUField, "ScalarGPUField" ); - - for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) - { - GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId ); - GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId ); - - for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir ) - { - localTester.test( *dir ); - bufferTester.test( *dir ); - } - } - } - - return 0; -} diff --git a/tests/fft/GreensTest.cpp b/tests/fft/GreensTest.cpp index cb8797f187c217f1ac787288a4737ee6a8f84456..e8eeaa8b9b60be7925eff281690846ac24a4c314 100644 --- 
a/tests/fft/GreensTest.cpp +++ b/tests/fft/GreensTest.cpp @@ -30,8 +30,8 @@ int main (int argc, char** argv) WALBERLA_ASSERT_EQUAL(cells_per_block*processes, L, "Number of processes per direction must evenly divide " << L); auto blocks = blockforest::createUniformBlockGrid(num_blocks,num_blocks,num_blocks, cells_per_block,cells_per_block,cells_per_block, 1.0, processes,processes,processes, true,true,true); - BlockDataID originalFieldId = field::addToStorage<Field_T>( blocks, "original", real_t(0), field::zyxf, 1 ); - BlockDataID fftFieldId = field::addToStorage<Field_T>( blocks, "result", real_t(0), field::zyxf, 1 ); + BlockDataID originalFieldId = field::addToStorage<Field_T>( blocks, "original", real_t(0), field::fzyx, 1 ); + BlockDataID fftFieldId = field::addToStorage<Field_T>( blocks, "result", real_t(0), field::fzyx, 1 ); auto comm = blockforest::communication::UniformBufferedScheme< stencil::D3Q7 >( blocks ); comm.addPackInfo(make_shared< field::communication::PackInfo< Field_T > >( fftFieldId )); diff --git a/tests/field/AddToStorageTest.cpp b/tests/field/AddToStorageTest.cpp index 270826cea118210674c7ce377dc2e262096ad41c..3f5a47f6e6722331cc77e4a5a01958af8d622214 100644 --- a/tests/field/AddToStorageTest.cpp +++ b/tests/field/AddToStorageTest.cpp @@ -42,7 +42,7 @@ int main( int argc, char ** argv ) ); typedef GhostLayerField<Vector3<uint_t>,1> VectorField; typedef GhostLayerField<uint_t, 3> FlattenedField; - BlockDataID fieldID = field::addToStorage<VectorField>( blocks, "Field" ); + BlockDataID fieldID = field::addToStorage<VectorField>( blocks, "Field", Vector3<uint_t>(uint_c(0)), field::zyxf ); BlockDataID flattenedID = field::addFlattenedShallowCopyToStorage<VectorField>( blocks, fieldID, "flattened Field"); for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) @@ -58,9 +58,9 @@ int main( int argc, char ** argv ) } } } - + BlockDataID copyID = field::addCloneToStorage<VectorField>( blocks, fieldID, "copied Field"); - + for( auto 
blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) { VectorField * field = blockIt->getData<VectorField>( fieldID ); diff --git a/tests/field/CMakeLists.txt b/tests/field/CMakeLists.txt index cf1e69f77d390fedbbe5c72920bf6a40cae9fec4..b48f4ac79d1a778ccd1cadad910b6fab00a99b36 100644 --- a/tests/field/CMakeLists.txt +++ b/tests/field/CMakeLists.txt @@ -28,6 +28,9 @@ waLBerla_execute_test( NAME FieldTiming ) waLBerla_compile_test( FILES FlagFieldTest.cpp) waLBerla_execute_test( NAME FlagFieldTest ) +waLBerla_compile_test( FILES StabilityCheckerTest.cpp DEPENDS blockforest field timeloop ) +waLBerla_execute_test( NAME StabilityCheckerTest ) + waLBerla_compile_test( FILES interpolators/InterpolationTest.cpp) waLBerla_execute_test( NAME InterpolationTest ) @@ -68,6 +71,11 @@ waLBerla_generate_target_from_python(NAME CodegenJacobiCPUGeneratedJacobiKernel waLBerla_compile_test( FILES codegen/CodegenJacobiCPU.cpp DEPENDS gui timeloop CodegenJacobiCPUGeneratedJacobiKernel) waLBerla_execute_test( NAME CodegenJacobiCPU ) +waLBerla_generate_target_from_python(NAME SweepCollectionKernel FILE codegen/SweepCollection.py + OUT_FILES SweepCollection.h SweepCollection.cpp) +waLBerla_compile_test( FILES codegen/SweepCollection.cpp DEPENDS timeloop SweepCollectionKernel) +waLBerla_execute_test( NAME SweepCollection ) + waLBerla_generate_target_from_python(NAME CodegenPoissonCPUGeneratedKernel FILE codegen/Poisson.py OUT_FILES Poisson.cpp Poisson.h ) waLBerla_compile_test( FILES codegen/CodegenPoissonCPU.cpp DEPENDS gui timeloop CodegenPoissonCPUGeneratedKernel) diff --git a/tests/field/FieldTest.cpp b/tests/field/FieldTest.cpp index 2021554dd992aa7ff26c71b2740f81712f4ce411..3214a77490d11e77ec67e92bc3c9541381977037 100644 --- a/tests/field/FieldTest.cpp +++ b/tests/field/FieldTest.cpp @@ -580,7 +580,7 @@ void fieldPointerTest() template<uint_t fSize> void flattenTest() { - Field<Vector3<uint_t>, fSize> field ( 2,2,1 ); + Field<Vector3<uint_t>, fSize> field ( 
2,2,1,field::zyxf ); for( cell_idx_t x = 0; x < cell_idx_c(field.xSize()); ++x ) for( cell_idx_t y = 0; y < cell_idx_c(field.ySize()); ++y ) @@ -594,7 +594,7 @@ void flattenTest() shared_ptr<Field<uint_t, 3*fSize>> flattened(field.flattenedShallowCopy()); - Field<uint_t, 3*fSize> cmp ( 2,2,1 ); + Field<uint_t, 3*fSize> cmp ( 2,2,1,field::zyxf ); WALBERLA_CHECK_EQUAL(cmp.xSize(), flattened->xSize()); WALBERLA_CHECK_EQUAL(cmp.ySize(), flattened->ySize()); WALBERLA_CHECK_EQUAL(cmp.zSize(), flattened->zSize()); @@ -626,7 +626,7 @@ void flattenTest() template<uint_t fSize> void ghostFlattenTest() { - GhostLayerField<Vector3<uint_t>, fSize> field ( 2,2,1, 1 ); + GhostLayerField<Vector3<uint_t>, fSize> field ( 2,2,1,1,field::zyxf ); for( cell_idx_t x = -cell_idx_c(field.nrOfGhostLayers()); x < cell_idx_c(field.xSize()+field.nrOfGhostLayers()); ++x ) for( cell_idx_t y = -cell_idx_c(field.nrOfGhostLayers()); y < cell_idx_c(field.ySize()+field.nrOfGhostLayers()); ++y ) @@ -640,7 +640,7 @@ void ghostFlattenTest() shared_ptr<GhostLayerField<uint_t, 3*fSize>> flattened(field.flattenedShallowCopy()); - GhostLayerField<uint_t, 3*fSize> cmp ( 2,2,1, 1 ); + GhostLayerField<uint_t, 3*fSize> cmp ( 2,2,1,1,field::zyxf ); WALBERLA_CHECK_EQUAL(cmp.xSize(), flattened->xSize()); WALBERLA_CHECK_EQUAL(cmp.ySize(), flattened->ySize()); WALBERLA_CHECK_EQUAL(cmp.zSize(), flattened->zSize()); diff --git a/tests/field/StabilityCheckerTest.cpp b/tests/field/StabilityCheckerTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb3d9046d1cf712d07e2f85fe176d962de50bead --- /dev/null +++ b/tests/field/StabilityCheckerTest.cpp @@ -0,0 +1,76 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file StabilityCheckerTest.cpp +//! \ingroup field +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/all.h" +#include "core/all.h" +#include "domain_decomposition/all.h" +#include "field/all.h" +#include "timeloop/all.h" + + + +namespace walberla { + +using Field_T = GhostLayerField<real_t, 1>; + +class TestSweep +{ + public: + TestSweep(BlockDataID fieldID) : fieldID_(fieldID) {} + + void operator()(IBlock* const block) + { + Field_T* field = block->getData< Field_T >(fieldID_); + + WALBERLA_FOR_ALL_CELLS(fieldIt, field, { *fieldIt += Field_T::value_type(1); }) // WALBERLA_FOR_ALL_CELLS + } + + private: + BlockDataID fieldID_; +}; + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + walberla::Environment walberlaEnv( argc, argv ); + + auto blocks = blockforest::createUniformBlockGrid( 1, 1, 1, + 4, 4, 4, + 1.0); + + BlockDataID fieldID = field::addToStorage<Field_T>( blocks, "Field", Field_T::value_type(0)); + SweepTimeloop timeloop(blocks->getBlockStorage(), uint_c(2)); + + timeloop.add() << Sweep(TestSweep(fieldID), "Test Sweep"); + + // LBM stability check + auto checkFunction = [](Field_T::value_type value) {return value < 
math::abs(Field_T::value_type(5));}; + timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< Field_T >( blocks, fieldID, uint_c(1), checkFunction) ),"Stability check" ); + timeloop.run(); + + return EXIT_SUCCESS; +} +} + +int main( int argc, char ** argv ) +{ + walberla::main(argc, argv); +} diff --git a/tests/field/codegen/CodegenJacobiCPU.cpp b/tests/field/codegen/CodegenJacobiCPU.cpp index 3bba9623ed02f18521431ac492f3b2c4d2a584d3..6755c687a9ff0496e02c0b776dcc90595b151090 100644 --- a/tests/field/codegen/CodegenJacobiCPU.cpp +++ b/tests/field/codegen/CodegenJacobiCPU.cpp @@ -84,7 +84,7 @@ void testJacobi2D() auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( fieldID ); - WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 4.0)); + WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 4.0)) } @@ -132,7 +132,7 @@ void testJacobi3D() auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( fieldID ); - WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 8.0)); + WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_c(1.0 / 8.0)) } diff --git a/tests/field/codegen/SweepCollection.cpp b/tests/field/codegen/SweepCollection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33c8d2be099b9146d6738a5d7809023dbb7fa3b4 --- /dev/null +++ b/tests/field/codegen/SweepCollection.cpp @@ -0,0 +1,89 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file SweepCollection.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== +#include "blockforest/Initialization.h" + +#include "core/Environment.h" +#include "core/debug/TestSubsystem.h" + +#include "field/AddToStorage.h" +#include "field/communication/PackInfo.h" + +#include "timeloop/SweepTimeloop.h" +#include "SweepCollection.h" + +using namespace walberla; + +typedef GhostLayerField<real_t, 1> ScalarField; +using SweepCollection_T = pystencils::SweepCollection; + +void testSweepCollection() +{ + uint_t xSize = 20; + uint_t ySize = 20; + uint_t zSize = 20; + // Create blocks + shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( + uint_t(1) , uint_t(1), uint_t(1), // number of blocks in x,y,z direction + xSize, ySize, zSize, // how many cells per block (x,y,z) + real_c(1.0), // dx: length of one cell in physical coordinates + false, // one block per process - "false" means all blocks to one process + true, true, true ); // full periodicity + + + const real_t initField1 = real_c(1.0); + const real_t initField2 = real_c(0.0); + const real_t initField3 = real_c(0.0); + const real_t a = real_c(2.0); + + const BlockDataID field1ID = field::addToStorage<ScalarField>(blocks, "Field1", initField1); + const BlockDataID field2ID = field::addToStorage<ScalarField>(blocks, "Field2", initField2); + const BlockDataID field3ID = field::addToStorage<ScalarField>(blocks, "Field3", initField3); + + 
SweepCollection_T sweepCollection(blocks, field1ID, field2ID, field3ID, a); + + // Create Timeloop + const uint_t numberOfTimesteps = uint_t(100); + SweepTimeloop timeloop ( blocks, numberOfTimesteps ); + + // Registering the sweep + timeloop.add() << Sweep( sweepCollection.fct1(SweepCollection_T::ALL), "fc1" ); + timeloop.add() << Sweep( sweepCollection.fct2(SweepCollection_T::ALL), "fc2" ); + + timeloop.run(); + + auto firstBlock = blocks->begin(); + auto field1 = firstBlock->getData<ScalarField>( field1ID ); + auto field2 = firstBlock->getData<ScalarField>( field2ID ); + auto field3 = firstBlock->getData<ScalarField>( field3ID ); + + WALBERLA_CHECK_FLOAT_EQUAL(field1->get(0,0,0), initField1) + WALBERLA_CHECK_FLOAT_EQUAL(field2->get(0,0,0), initField1 * real_c(2.0) * a) + WALBERLA_CHECK_FLOAT_EQUAL(field3->get(0,0,0), initField1 * real_c(2.0) * a * real_c(2.0) * a) +} + + +int main( int argc, char ** argv ) +{ + mpi::Environment env( argc, argv ); + debug::enterTestMode(); + + testSweepCollection(); + return EXIT_SUCCESS; +} diff --git a/tests/field/codegen/SweepCollection.py b/tests/field/codegen/SweepCollection.py new file mode 100644 index 0000000000000000000000000000000000000000..1229a2e2ec4594e7f10596b08dd7801f3a0d465a --- /dev/null +++ b/tests/field/codegen/SweepCollection.py @@ -0,0 +1,19 @@ +import sympy as sp + +import pystencils as ps +from pystencils import Assignment +from pystencils_walberla import CodeGeneration, function_generator, generate_sweep_collection + + +with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" + + a = sp.Symbol('a') + f1, f2, f3 = ps.fields(f"f1, f2, f3: {field_type}[3D]", layout='fzyx') + up1 = Assignment(f2.center, 2 * a * f1.center) + up2 = Assignment(f3.center, 2 * a * f2.center) + + fct1 = function_generator(ctx, 'fct1', up1) + fct2 = function_generator(ctx, 'fct2', up2) + + generate_sweep_collection(ctx, "SweepCollection", [fct1, fct2]) diff --git 
a/tests/field/distributors/DistributionTest.cpp b/tests/field/distributors/DistributionTest.cpp index daf4784ba15e9ca733b49622eead02b63b8c9c41..a96429c326eb5ef49fb200d7b31ab7f78873f8d4 100644 --- a/tests/field/distributors/DistributionTest.cpp +++ b/tests/field/distributors/DistributionTest.cpp @@ -511,9 +511,9 @@ int main(int argc, char **argv) { BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers, false, initFlagField ); // data fields - BlockDataID scalarFieldID = field::addToStorage< ScalarField_T >( blocks, "scalar field", real_t(0), field::zyxf, FieldGhostLayers ); - BlockDataID vectorFieldID = field::addToStorage< Vec3Field_T >( blocks, "vec3 field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID multiComponentFieldID = field::addToStorage< MultiComponentField_T >( blocks, "multi component field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID scalarFieldID = field::addToStorage< ScalarField_T >( blocks, "scalar field", real_t(0), field::fzyx, FieldGhostLayers ); + BlockDataID vectorFieldID = field::addToStorage< Vec3Field_T >( blocks, "vec3 field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID multiComponentFieldID = field::addToStorage< MultiComponentField_T >( blocks, "multi component field", real_t(0), field::fzyx, FieldGhostLayers ); // test all distributors with domain flags everywhere, i.e. 
without special boundary treatment necessary testNearestNeighborDistributor(blocks, flagFieldID, scalarFieldID, vectorFieldID, multiComponentFieldID); diff --git a/tests/field/interpolators/FieldInterpolationTest.cpp b/tests/field/interpolators/FieldInterpolationTest.cpp index 3571c3fd9cf154b98d88801301db89181e8b47b5..712f89de97c12a9dae6d8f9f8ec6b7c1c9639009 100644 --- a/tests/field/interpolators/FieldInterpolationTest.cpp +++ b/tests/field/interpolators/FieldInterpolationTest.cpp @@ -399,9 +399,9 @@ int main(int argc, char **argv) { BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers, false, initFlagField ); // data fields - BlockDataID scalarFieldID = field::addToStorage< ScalarField_T >( blocks, "scalar field", real_t(0), field::zyxf, FieldGhostLayers ); - BlockDataID vectorFieldID = field::addToStorage< Vec3Field_T >( blocks, "vec3 field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID multiComponentFieldID = field::addToStorage< MultiComponentField_T >( blocks, "multi component field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID scalarFieldID = field::addToStorage< ScalarField_T >( blocks, "scalar field", real_t(0), field::fzyx, FieldGhostLayers ); + BlockDataID vectorFieldID = field::addToStorage< Vec3Field_T >( blocks, "vec3 field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID multiComponentFieldID = field::addToStorage< MultiComponentField_T >( blocks, "multi component field", real_t(0), field::fzyx, FieldGhostLayers ); initScalarField(blocks, scalarFieldID); initVectorField(blocks, vectorFieldID ); diff --git a/tests/cuda/AlignmentTest.cpp b/tests/gpu/AlignmentTest.cpp similarity index 93% rename from tests/cuda/AlignmentTest.cpp rename to tests/gpu/AlignmentTest.cpp index 3de12c7628c98da8797d442f6236d7893829788e..12d50e6c085cb505ed1855855b43845cb95e0abc 100644 --- a/tests/cuda/AlignmentTest.cpp +++ b/tests/gpu/AlignmentTest.cpp 
@@ -18,14 +18,14 @@ // //====================================================================================================================== -#include "cuda/AlignedAllocation.h" +#include "gpu/AlignedAllocation.h" #include "core/mpi/Environment.h" #include "core/debug/TestSubsystem.h" #include "core/logging/Logging.h" using namespace walberla; -using namespace cuda; +using namespace gpu; int main( int argc, char ** argv ) @@ -39,12 +39,12 @@ int main( int argc, char ** argv ) size_t alignment = 512; size_t offset = 16; void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset ); - WALBERLA_LOG_INFO("Pitch " << pitch); + WALBERLA_LOG_INFO("Pitch " << pitch) char * cptr = reinterpret_cast<char*>( ptr ); WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 ); free_aligned_with_offset( ptr ); - return 0; + return EXIT_SUCCESS; } diff --git a/tests/cuda/CMakeLists.txt b/tests/gpu/CMakeLists.txt similarity index 97% rename from tests/cuda/CMakeLists.txt rename to tests/gpu/CMakeLists.txt index 723f7818710f502a792d0e5732fc0cf70f4bec01..e760cca4db10704cbedfef89a4faca71631ea730 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -1,6 +1,6 @@ ################################################################################################### # -# Tests for cuda +# Tests for gpu # ################################################################################################### @@ -34,7 +34,7 @@ waLBerla_execute_test( NAME CodegenJacobiGPU ) waLBerla_generate_target_from_python(NAME CodegenPoissonGPUGeneratedKernel FILE codegen/CudaPoisson.py OUT_FILES PoissonGPU.cu PoissonGPU.h ) -waLBerla_compile_test( FILES codegen/CodegenPoissonGPU.cpp DEPENDS gui cuda timeloop CodegenPoissonGPUGeneratedKernel) +waLBerla_compile_test( FILES codegen/CodegenPoissonGPU.cpp DEPENDS gui gpu timeloop CodegenPoissonGPUGeneratedKernel) waLBerla_execute_test( NAME CodegenPoissonGPU ) # The following tests work only for CUDA enabled MPI diff 
--git a/tests/cuda/CudaMPI.cpp b/tests/gpu/CudaMPI.cpp similarity index 97% rename from tests/cuda/CudaMPI.cpp rename to tests/gpu/CudaMPI.cpp index 56d03807f25da2e62e1616d95cf52abd8d92dd9c..20cee24788dd92888563c16a25b8fdf7bed07e90 100644 --- a/tests/cuda/CudaMPI.cpp +++ b/tests/gpu/CudaMPI.cpp @@ -27,7 +27,7 @@ #include "core/logging/Logging.h" #include "core/mpi/Datatype.h" -#include "cuda/GPUField.h" +#include "gpu/GPUField.h" #include "field/communication/MPIDatatypes.h" #include "field/AddToStorage.h" @@ -44,7 +44,7 @@ void fullFieldTransfer() Field<double,4> h_f1 ( 3, 4, 2, 42.0, field::fzyx ); Field<double,4> h_f2 ( 3, 4, 2, 27.0, field::fzyx ); - cuda::GPUField<double> d_f ( 3, 4, 2, 4, 0, field::fzyx ); + gpu::GPUField<double> d_f ( 3, 4, 2, 4, 0, field::fzyx ); // Transfer h_f1 from CPU to GPU d_f @@ -94,7 +94,7 @@ void blockStorageAndGui( int argc, char ** argv ) BlockDataID cpuFieldID1 = field::addToStorage<ScalarField>( blocks, "CPUField 1", real_c(42), field::fzyx, uint_c(1) ); BlockDataID cpuFieldID2 = field::addToStorage<ScalarField>( blocks, "CPUField 2", real_c(0), field::fzyx, uint_c(1) ); - typedef cuda::GPUField<real_t> GPUField; + typedef gpu::GPUField<real_t> GPUField; BlockDataID gpuFieldID = blocks->addStructuredBlockData< GPUField >( [&] ( IBlock * block, StructuredBlockStorage * const s ) { return new GPUField( s->getNumberOfXCells(*block), diff --git a/tests/cuda/FieldIndexing3DTest.cpp b/tests/gpu/FieldIndexing3DTest.cpp similarity index 78% rename from tests/cuda/FieldIndexing3DTest.cpp rename to tests/gpu/FieldIndexing3DTest.cpp index 4ad2622bc3fb112c6ac960840f2af33af79f65c0..82c677070fbb202cdcd3f7fcc77ac08c9e045a46 100644 --- a/tests/cuda/FieldIndexing3DTest.cpp +++ b/tests/gpu/FieldIndexing3DTest.cpp @@ -22,23 +22,22 @@ #include "core/debug/TestSubsystem.h" #include "core/Environment.h" -#include "core/mpi/Datatype.h" #include "field/GhostLayerField.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include 
"cuda/Kernel.h" -#include "cuda/FieldIndexing3D.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" +#include "gpu/Kernel.h" +#include "gpu/FieldIndexing3D.h" #include "FieldIndexing3DTest.h" using namespace walberla; -typedef cuda::FieldIndexing3D<int> FieldIdx3D_T; -typedef GhostLayerField<int , F_SIZE> HostField_T; -typedef cuda::GPUField<int> GPUField_T; +using FieldIdx3D_T = gpu::FieldIndexing3D<int>; +using HostField_T = GhostLayerField<int , F_SIZE>; +using GPUField_T = gpu::GPUField<int> ; @@ -46,14 +45,14 @@ void xyzTest() { const HostField_T emptyField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); GPUField_T deviceField( X_SIZE, Y_SIZE, Z_SIZE, F_SIZE, 1, LAYOUT ); - cuda::fieldCpy( deviceField, emptyField ); + gpu::fieldCpy( deviceField, emptyField ); - auto setValue = cuda::make_kernel( &setValueKernel ); + auto setValue = gpu::make_kernel( &setValueKernel ); setValue.addFieldIndexingParam( FieldIdx3D_T::xyz( deviceField ) ); setValue(); HostField_T resultField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); - cuda::fieldCpy( resultField, deviceField ); + gpu::fieldCpy( resultField, deviceField ); HostField_T expectedField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); WALBERLA_FOR_ALL_CELLS_XYZ( &expectedField, @@ -63,7 +62,7 @@ void xyzTest() } ) - WALBERLA_ASSERT( resultField == expectedField ); + WALBERLA_ASSERT( resultField == expectedField ) } @@ -71,14 +70,14 @@ void sliceBeforeGhostLayerXYZTest() { const HostField_T emptyField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); GPUField_T deviceField( X_SIZE, Y_SIZE, Z_SIZE, F_SIZE, 1, LAYOUT ); - cuda::fieldCpy( deviceField, emptyField ); + gpu::fieldCpy( deviceField, emptyField ); - auto setValue = cuda::make_kernel( &setValueKernel ); + auto setValue = gpu::make_kernel( &setValueKernel ); setValue.addFieldIndexingParam( FieldIdx3D_T::sliceBeforeGhostLayerXYZ( deviceField, 1, stencil::B, true ) ); setValue(); HostField_T resultField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); - 
cuda::fieldCpy( resultField, deviceField ); + gpu::fieldCpy( resultField, deviceField ); HostField_T expectedField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); CellInterval ci; @@ -89,17 +88,17 @@ void sliceBeforeGhostLayerXYZTest() expectedField.get( x, y, z, f ) = IDX4D( x - ci.xMin(), y - ci.yMin(), z - ci.zMin(), f ); } ) - WALBERLA_ASSERT( resultField == expectedField ); + WALBERLA_ASSERT( resultField == expectedField ) } int main( int argc, char ** argv ) { debug::enterTestMode(); - walberla::Environment walberlaEnv( argc, argv ); + walberla::Environment const walberlaEnv( argc, argv ); xyzTest(); sliceBeforeGhostLayerXYZTest(); - return 0; + return EXIT_SUCCESS; } diff --git a/tests/cuda/FieldIndexing3DTest.cu b/tests/gpu/FieldIndexing3DTest.cu similarity index 87% rename from tests/cuda/FieldIndexing3DTest.cu rename to tests/gpu/FieldIndexing3DTest.cu index 7ade5bfc6a4149ffb65255c1e7a99aa9e159b72d..edbec01be8150c298aa0fe2a3b703d5156bb867b 100644 --- a/tests/cuda/FieldIndexing3DTest.cu +++ b/tests/gpu/FieldIndexing3DTest.cu @@ -25,9 +25,9 @@ namespace walberla { __global__ void setValueKernel( FieldAccessor3D_T fa ) { - unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + unsigned int const x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int const y = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int const z = blockIdx.z * blockDim.z + threadIdx.z; fa.set( blockIdx, threadIdx ); if ( fa.isValidPosition() ) diff --git a/tests/cuda/FieldIndexing3DTest.h b/tests/gpu/FieldIndexing3DTest.h similarity index 94% rename from tests/cuda/FieldIndexing3DTest.h rename to tests/gpu/FieldIndexing3DTest.h index 80e1b6cfea60e65d490d1e6c65fc03a0f62660f5..54dcc86053a7d4c8dce079704e6a9926064e3c98 100644 --- a/tests/cuda/FieldIndexing3DTest.h +++ b/tests/gpu/FieldIndexing3DTest.h @@ -20,7 +20,7 @@ #pragma once -#include "cuda/FieldAccessor3D.h" 
+#include "gpu/FieldAccessor3D.h" #define X_SIZE (64-2) #define Y_SIZE (64-2) @@ -37,7 +37,7 @@ namespace walberla { -typedef cuda::FieldAccessor3D<int> FieldAccessor3D_T; +using FieldAccessor3D_T = gpu::FieldAccessor3D<int>; __global__ void setValueKernel( FieldAccessor3D_T fa ); diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/gpu/FieldTransferTest.cpp similarity index 75% rename from tests/cuda/FieldTransferTest.cpp rename to tests/gpu/FieldTransferTest.cpp index 7a41330a23ab24e7aaeb2055efe9155dc3aa4ca2..c8f5126adf99827aa723c9ec8e7ab45ee6a03185 100644 --- a/tests/cuda/FieldTransferTest.cpp +++ b/tests/gpu/FieldTransferTest.cpp @@ -24,8 +24,8 @@ #include "field/Field.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" #include "core/math/Random.h" @@ -40,19 +40,18 @@ void simpleTransfer() h_f1(x, y, z, 0) = math::realRandom<double>(); ) - cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx ); + gpu::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx ); - WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize()); - WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize()); - WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize()); - WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize()); - WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout()); + WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize()) + WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize()) + WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize()) + WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize()) + WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout()) + gpu::fieldCpy( d_f, h_f1 ); + gpu::fieldCpy( h_f2, d_f ); - cuda::fieldCpy( d_f, h_f1 ); - cuda::fieldCpy( h_f2, d_f ); - - WALBERLA_CHECK_EQUAL( h_f1, h_f2 ); + WALBERLA_CHECK_EQUAL( h_f1, h_f2 ) } @@ -63,5 +62,5 @@ int main( int argc, char **argv ) simpleTransfer(); - return 0; + return EXIT_SUCCESS; } diff --git a/tests/cuda/Kernels.cu b/tests/gpu/Kernels.cu similarity index 57% rename from tests/cuda/Kernels.cu rename to 
tests/gpu/Kernels.cu index e0d4c2f9762218960b4ffe6e9285f3a4fc9f2984..daefdb58f559db4b94daa7111c2156a0c7ee4a6e 100644 --- a/tests/cuda/Kernels.cu +++ b/tests/gpu/Kernels.cu @@ -1,14 +1,15 @@ -#include "cuda/FieldAccessor.h" +#include "gpu/FieldAccessor.h" namespace walberla { -namespace cuda { +namespace gpu +{ template<typename T> class GPUField; } -__global__ void kernel_double( cuda::FieldAccessor<double> f ) +__global__ void kernel_double(gpu::FieldAccessor<double> f ) { f.set( blockIdx, threadIdx ); f.get() *= 2.0; diff --git a/tests/cuda/SimpleKernelTest.cpp b/tests/gpu/SimpleKernelTest.cpp similarity index 71% rename from tests/cuda/SimpleKernelTest.cpp rename to tests/gpu/SimpleKernelTest.cpp index f2f9a2a8b4ceb2cec7f032d4e732938d281ef63b..a4fd1db371912840bbca12eadda20b942588b5af 100644 --- a/tests/cuda/SimpleKernelTest.cpp +++ b/tests/gpu/SimpleKernelTest.cpp @@ -18,7 +18,6 @@ // //====================================================================================================================== -#include "cuda/FieldIndexing.h" #include "blockforest/Initialization.h" #include "core/debug/TestSubsystem.h" @@ -26,17 +25,16 @@ #include "field/GhostLayerField.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/Kernel.h" -#include "gui/Gui.h" -#include "timeloop/SweepTimeloop.h" +#include "gpu/GPUField.h" +#include "gpu/FieldIndexing.h" +#include "gpu/FieldCopy.h" +#include "gpu/Kernel.h" using namespace walberla; namespace walberla{ -void kernel_double( cuda::FieldAccessor<double> f ); +void kernel_double(gpu::FieldAccessor<double> f ); } GhostLayerField<double,1> * createCPUField( IBlock* const block, StructuredBlockStorage* const storage ) @@ -50,9 +48,9 @@ GhostLayerField<double,1> * createCPUField( IBlock* const block, StructuredBlock field::fzyx); } -cuda::GPUField<double> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) +gpu::GPUField<double> * createGPUField( IBlock* const block, 
StructuredBlockStorage* const storage ) { - return new cuda::GPUField<double> ( + return new gpu::GPUField<double> ( storage->getNumberOfXCells( *block ), // number of cells in x direction storage->getNumberOfYCells( *block ), // number of cells in y direction storage->getNumberOfZCells( *block ), // number of cells in z direction @@ -64,10 +62,10 @@ cuda::GPUField<double> * createGPUField( IBlock* const block, StructuredBlockSto int main( int argc, char ** argv ) { - walberla::Environment env( argc, argv ); + walberla::Environment const env( argc, argv ); debug::enterTestMode(); - shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( + shared_ptr< StructuredBlockForest > const blocks = blockforest::createUniformBlockGrid ( uint_t(1), uint_t(1), uint_t(1), // number of blocks in x,y,z direction uint_t(14), uint_t(14), uint_t(14), // how many cells per block (x,y,z) real_c(0.5), // dx: length of one cell in physical coordinates @@ -76,10 +74,10 @@ int main( int argc, char ** argv ) - BlockDataID cpuFieldID = blocks->addStructuredBlockData< GhostLayerField<double,1> > ( &createCPUField, "CPUField" ); + BlockDataID const cpuFieldID = blocks->addStructuredBlockData< GhostLayerField<double,1> > ( &createCPUField, "CPUField" ); - BlockDataID gpuFieldID = blocks->addStructuredBlockData< cuda::GPUField<double> > ( &createGPUField, "GPUField" ); + BlockDataID const gpuFieldID = blocks->addStructuredBlockData< gpu::GPUField<double> > ( &createGPUField, "GPUField" ); for ( auto blockIterator = blocks->begin(); blockIterator != blocks->end(); ++blockIterator ) { @@ -87,26 +85,19 @@ int main( int argc, char ** argv ) // get the field stored on the current block auto cpuField = currentBlock.getData< GhostLayerField<double,1> > ( cpuFieldID ); - auto gpuField = currentBlock.getData< cuda::GPUField<double> > ( gpuFieldID ); + auto gpuField = currentBlock.getData< gpu::GPUField<double> > ( gpuFieldID ); - cuda::fieldCpy( *gpuField, *cpuField ); + 
gpu::fieldCpy( *gpuField, *cpuField ); - auto myKernel = cuda::make_kernel( &kernel_double ); - auto indexing = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( *gpuField, 1, stencil::W, true ); + auto myKernel = gpu::make_kernel( &kernel_double ); + auto indexing = gpu::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( *gpuField, 1, stencil::W, true ); myKernel.addFieldIndexingParam(indexing); myKernel(); - cuda::fieldCpy( *cpuField, *gpuField ); + gpu::fieldCpy( *cpuField, *gpuField ); WALBERLA_ASSERT_FLOAT_EQUAL( cpuField->get(0,0,0), real_t(2) ) } - - //SweepTimeloop timeloop ( blocks, uint_t(1) ); - //timeloop.run(); - //GUI gui ( timeloop, blocks, argc, argv ); - //gui.run(); - - return EXIT_SUCCESS; } diff --git a/tests/cuda/codegen/CodegenJacobiGPU.cpp b/tests/gpu/codegen/CodegenJacobiGPU.cpp similarity index 71% rename from tests/cuda/codegen/CodegenJacobiGPU.cpp rename to tests/gpu/codegen/CodegenJacobiGPU.cpp index 93814e0a51bbff7398a57fd3e3130ce751549121..67e43894359123c9228819f48bfb972b20cf6e88 100644 --- a/tests/cuda/codegen/CodegenJacobiGPU.cpp +++ b/tests/gpu/codegen/CodegenJacobiGPU.cpp @@ -21,7 +21,7 @@ #include "CudaJacobiKernel2D.h" #include "CudaJacobiKernel3D.h" -#include "cuda/HostFieldAllocator.h" +#include "gpu/HostFieldAllocator.h" #include "blockforest/Initialization.h" #include "blockforest/communication/UniformDirectScheme.h" #include "blockforest/communication/UniformBufferedScheme.h" @@ -29,11 +29,11 @@ #include "core/Environment.h" #include "core/debug/TestSubsystem.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/GPUPackInfo.h" -#include "cuda/FieldIndexing.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/FieldIndexing.h" #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -49,27 +49,28 @@ using namespace 
walberla; -typedef GhostLayerField<real_t, 1> ScalarField; -typedef cuda::GPUField<real_t> GPUField; +using ScalarField = GhostLayerField<real_t, 1>; +using GPUField = gpu::GPUField<real_t>; ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage ) { - return new ScalarField ( - storage->getNumberOfXCells( *block ), // number of cells in x direction per block - storage->getNumberOfYCells( *block ), // number of cells in y direction per block - storage->getNumberOfZCells( *block ), // number of cells in z direction per block - 1, // one ghost layer - double(0), // initial value - field::fzyx, // layout - make_shared<cuda::HostFieldAllocator<real_t> >() // allocator for host pinned memory - ); + auto xSize = storage->getNumberOfXCells( *block ); + auto ySize = storage->getNumberOfYCells( *block ); + auto zSize = storage->getNumberOfZCells( *block ); + auto numberOfGhostLayers = uint_c(1); + auto initialValue = real_c(0); + auto fieldLayout = field::fzyx; + return new ScalarField (xSize, ySize, zSize, + numberOfGhostLayers, initialValue, fieldLayout, + make_shared< gpu::HostFieldAllocator<real_t> >() // allocator for host pinned memory + ); } void testJacobi2D() { - uint_t xSize = 20; - uint_t ySize = 20; + uint_t const xSize = 20; + uint_t const ySize = 20; // Create blocks shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( @@ -80,8 +81,8 @@ void testJacobi2D() true, true, true ); // no periodicity - BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); - BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID const cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); + BlockDataID const gpuField = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); // Initialize a quarter of the field with ones, the rest remains 0 // Jacobi averages the 
domain -> every cell should be at 0.25 at sufficiently many timesteps @@ -93,8 +94,8 @@ void testJacobi2D() f->get( x, y, 0 ) = real_c(1.0); } - typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9> CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; + using CommScheme = blockforest::communication::UniformBufferedScheme<stencil::D2Q9>; + using Packing = gpu::communication::GPUPackInfo<GPUField> ; CommScheme commScheme(blocks); commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) ); @@ -107,10 +108,9 @@ void testJacobi2D() timeloop.add() << BeforeFunction( commScheme, "Communication" ) << Sweep( pystencils::CudaJacobiKernel2D(gpuField), "Jacobi Kernel" ); - - cuda::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); + gpu::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); timeloop.run(); - cuda::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); + gpu::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( cpuFieldID ); @@ -120,9 +120,9 @@ void testJacobi2D() void testJacobi3D() { - uint_t xSize = 12; - uint_t ySize = 12; - uint_t zSize = 12; + uint_t const xSize = 12; + uint_t const ySize = 12; + uint_t const zSize = 12; // Create blocks shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid ( @@ -133,8 +133,8 @@ void testJacobi3D() true, true, true ); // no periodicity - BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); - BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); + BlockDataID const cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" ); + BlockDataID const gpuField = gpu::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); // Initialize a quarter of the field with ones, the rest remains 0 // Jacobi 
averages the domain -> every cell should be at 0.25 at sufficiently many timesteps @@ -147,8 +147,8 @@ void testJacobi3D() f->get( x, y, z ) = real_c(1.0); } - typedef blockforest::communication::UniformBufferedScheme<stencil::D3Q7> CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; + using CommScheme = blockforest::communication::UniformBufferedScheme<stencil::D3Q7>; + using Packing = gpu::communication::GPUPackInfo<GPUField>; CommScheme commScheme(blocks); commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) ); @@ -161,10 +161,9 @@ void testJacobi3D() timeloop.add() << BeforeFunction( commScheme, "Communication" ) << Sweep( pystencils::CudaJacobiKernel3D(gpuField), "Jacobi Kernel" ); - - cuda::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); + gpu::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID ); timeloop.run(); - cuda::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); + gpu::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField ); auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField>( cpuFieldID ); @@ -173,7 +172,7 @@ void testJacobi3D() int main( int argc, char ** argv ) { - mpi::Environment env( argc, argv ); + mpi::Environment const env( argc, argv ); debug::enterTestMode(); testJacobi2D(); diff --git a/tests/cuda/codegen/CodegenPoissonGPU.cpp b/tests/gpu/codegen/CodegenPoissonGPU.cpp similarity index 83% rename from tests/cuda/codegen/CodegenPoissonGPU.cpp rename to tests/gpu/codegen/CodegenPoissonGPU.cpp index ef5ae96c00fef1a20c52dab51a1419539f2fd5a4..ece41a7346ca47c52625b6744790ce79110ecf61 100644 --- a/tests/cuda/codegen/CodegenPoissonGPU.cpp +++ b/tests/gpu/codegen/CodegenPoissonGPU.cpp @@ -28,11 +28,11 @@ #include "core/debug/TestSubsystem.h" #include "core/math/Constants.h" -#include "cuda/FieldCopy.h" -#include "cuda/GPUField.h" -#include "cuda/AddGPUFieldToStorage.h" -#include "cuda/communication/GPUPackInfo.h" -#include 
"cuda/FieldIndexing.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/AddGPUFieldToStorage.h" +#include "gpu/communication/GPUPackInfo.h" +#include "gpu/FieldIndexing.h" #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -47,8 +47,11 @@ using namespace walberla; -typedef GhostLayerField<real_t, 1> ScalarField_T; -typedef cuda::GPUField<real_t> GPUField; +using ScalarField_T = GhostLayerField<real_t, 1>; +using GPUField = gpu::GPUField<real_t>; + +using CommScheme = blockforest::communication::UniformBufferedScheme<stencil::D2Q9>; +using Packing = gpu::communication::GPUPackInfo<GPUField>; // U with Dirichlet Boundary @@ -104,17 +107,13 @@ void testPoisson() BlockDataID cpuFieldID = field::addToStorage< ScalarField_T >( blocks, "CPU Field src", real_c(0.0) ); - BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField_T>( blocks, cpuFieldID, "GPU Field src" ); + BlockDataID gpuField = gpu::addGPUFieldToStorage<ScalarField_T>( blocks, cpuFieldID, "GPU Field src" ); initU( blocks, cpuFieldID ); BlockDataID cpufId = field::addToStorage< ScalarField_T >( blocks, "CPU Field f", real_c(0.0)); - BlockDataID gpufId = cuda::addGPUFieldToStorage<ScalarField_T>( blocks, cpufId, "GPU Field f" ); + BlockDataID gpufId = gpu::addGPUFieldToStorage<ScalarField_T>( blocks, cpufId, "GPU Field f" ); initF( blocks, cpufId ); - - typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9> CommScheme; - typedef cuda::communication::GPUPackInfo<GPUField> Packing; - CommScheme commScheme(blocks); commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) ); @@ -126,11 +125,10 @@ void testPoisson() timeloop.add() << BeforeFunction( commScheme, "Communication" ) << Sweep( pystencils::PoissonGPU(gpufId, gpuField, dx, dy), "Jacobi Kernel" ); - - cuda::fieldCpy<GPUField, ScalarField_T>( blocks, gpuField, cpuFieldID ); - cuda::fieldCpy<GPUField, ScalarField_T>( blocks, gpufId, cpufId ); + gpu::fieldCpy<GPUField, 
ScalarField_T>( blocks, gpuField, cpuFieldID ); + gpu::fieldCpy<GPUField, ScalarField_T>( blocks, gpufId, cpufId ); timeloop.run(); - cuda::fieldCpy<ScalarField_T, GPUField>( blocks, cpuFieldID, gpuField ); + gpu::fieldCpy<ScalarField_T, GPUField>( blocks, cpuFieldID, gpuField ); auto firstBlock = blocks->begin(); auto f = firstBlock->getData<ScalarField_T>( cpuFieldID ); @@ -140,7 +138,7 @@ void testPoisson() int main( int argc, char ** argv ) { - mpi::Environment env( argc, argv ); + mpi::Environment const env( argc, argv ); debug::enterTestMode(); testPoisson(); diff --git a/tests/cuda/codegen/CudaJacobiKernel.py b/tests/gpu/codegen/CudaJacobiKernel.py similarity index 100% rename from tests/cuda/codegen/CudaJacobiKernel.py rename to tests/gpu/codegen/CudaJacobiKernel.py diff --git a/tests/cuda/codegen/CudaPoisson.py b/tests/gpu/codegen/CudaPoisson.py similarity index 100% rename from tests/cuda/codegen/CudaPoisson.py rename to tests/gpu/codegen/CudaPoisson.py diff --git a/tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.cpp b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp similarity index 85% rename from tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.cpp rename to tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp index 47c7d6741819e8f71cb2f348e1d21c28a90f1cfc..55bf49b1b1fb158164d5b9a764fd35ae02defaf5 100644 --- a/tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.cpp +++ b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.cpp @@ -28,8 +28,9 @@ #include "core/debug/TestSubsystem.h" #include "core/Environment.h" -#include "cuda/FieldCopy.h" -#include "cuda/communication/UniformGPUScheme.h" +#include "gpu/GPUWrapper.h" +#include "gpu/FieldCopy.h" +#include "gpu/communication/UniformGPUScheme.h" #include "stencil/D3Q27.h" @@ -42,9 +43,9 @@ namespace walberla { using Stencil_T = stencil::D3Q27; -cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) { +gpu::GPUField<int> * createGPUField( IBlock* const block, 
StructuredBlockStorage* const storage ) { - return new cuda::GPUField<int> ( + return new gpu::GPUField<int> ( storage->getNumberOfXCells( *block ), // number of cells in x direction storage->getNumberOfYCells( *block ), // number of cells in y direction storage->getNumberOfZCells( *block ), // number of cells in z direction @@ -54,19 +55,18 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag } -cuda::GPUField<int> * createSmallGPUField( IBlock * const , StructuredBlockStorage * const ) { - return new cuda::GPUField<int> (2, 2, 2, 1, 1, field::fzyx ); +gpu::GPUField<int> * createSmallGPUField( IBlock * const , StructuredBlockStorage * const ) { + return new gpu::GPUField<int> (2, 2, 2, 1, 1, field::fzyx ); } void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, BlockDataID gpuFieldId ) { - - cuda::communication::UniformGPUScheme< Stencil_T > us{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us{ sbf, false, false }; us.addPackInfo(std::make_shared< pystencils::ScalarFieldCommunicationGPU >(gpuFieldId)); for( auto & block : *sbf ) { - auto & gpuField = *(block.getData< cuda::GPUField< int > >(gpuFieldId)); + auto & gpuField = *(block.getData< gpu::GPUField< int > >(gpuFieldId)); field::GhostLayerField< int, 1 > cpuField(gpuField.xSize(), gpuField.ySize(), gpuField.zSize(), 1, 0, field::fzyx); @@ -82,12 +82,12 @@ void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, cpuField(1, 0, 0) = 3; cpuField(1, 1, 0) = 4; - cuda::fieldCpy(gpuField, cpuField); + gpu::fieldCpy(gpuField, cpuField); // communicate us.communicate(); - cuda::fieldCpy(cpuField, gpuField); + gpu::fieldCpy(cpuField, gpuField); WALBERLA_CHECK_EQUAL(cpuField(0, 0, +2), 1) WALBERLA_CHECK_EQUAL(cpuField(0, 1, +2), 2) @@ -98,16 +98,15 @@ void testScalarField( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, } void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockForest> & sbf, 
BlockDataID gpuFieldId ) { - - cuda::communication::UniformGPUScheme< Stencil_T > us1{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us1{ sbf, false, false }; us1.addPackInfo(std::make_shared< pystencils::ScalarFieldPullReductionGPU >(gpuFieldId)); - cuda::communication::UniformGPUScheme< Stencil_T > us2{ sbf }; + gpu::communication::UniformGPUScheme< Stencil_T > us2{ sbf, false, false }; us2.addPackInfo(std::make_shared< pystencils::ScalarFieldCommunicationGPU >(gpuFieldId)); for( auto & block : *sbf ) { - auto& gpuField = *(block.getData< cuda::GPUField< int > >(gpuFieldId)); + auto& gpuField = *(block.getData< gpu::GPUField< int > >(gpuFieldId)); field::GhostLayerField< int, 1 > cpuField(gpuField.xSize(), gpuField.ySize(), gpuField.zSize(), 1, 0, field::fzyx); @@ -129,12 +128,12 @@ void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockF cpuField(1, 0, 1) = 1; cpuField(1, 1, 1) = 1; - cuda::fieldCpy(gpuField, cpuField); + gpu::fieldCpy(gpuField, cpuField); // communicate pull += reduction us1.communicate(); - cuda::fieldCpy(cpuField, gpuField); + gpu::fieldCpy(cpuField, gpuField); // check values in top ghost layer WALBERLA_CHECK_EQUAL(cpuField(0, 0, 2), 0) @@ -151,7 +150,7 @@ void testScalarFieldPullReduction( std::shared_ptr<blockforest::StructuredBlockF // communicate to sync ghost layers us2.communicate(); - cuda::fieldCpy(cpuField, gpuField); + gpu::fieldCpy(cpuField, gpuField); // check values in bottom ghost layer WALBERLA_CHECK_EQUAL(cpuField(0, 0, -1), 2) @@ -184,7 +183,7 @@ int main(int argc, char **argv) { true,true,true); //periodicity // Create a Field with the same number of cells as the block - BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> > ( &createGPUField, "ScalarGPUField" ); + BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData< gpu::GPUField<int> > ( &createGPUField, "ScalarGPUField" ); testScalarField( blocks, scalarGPUFieldId ); @@ -196,7 +195,7 @@ int 
main(int argc, char **argv) { true,true,true);//periodicity // Create a Field with one quarter as many cells per dimension, i.e. a field with the same size as the one above - scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> > ( &createSmallGPUField, "ScalarGPUField" ); + scalarGPUFieldId = blocks->addStructuredBlockData< gpu::GPUField<int> > ( &createSmallGPUField, "ScalarGPUField" ); testScalarField( blocks, scalarGPUFieldId ); diff --git a/tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.py b/tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.py similarity index 100% rename from tests/cuda/codegen/GeneratedFieldPackInfoTestGPU.py rename to tests/gpu/codegen/GeneratedFieldPackInfoTestGPU.py diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp b/tests/gpu/codegen/MicroBenchmarkGpuLbm.cpp similarity index 83% rename from tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp rename to tests/gpu/codegen/MicroBenchmarkGpuLbm.cpp index 8b03e0277f4ff892d1b948555a8c4af08db3bba0..43b5864062dee2be11bbcd3bc4cb7cfea349c56b 100644 --- a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp +++ b/tests/gpu/codegen/MicroBenchmarkGpuLbm.cpp @@ -25,9 +25,9 @@ #include "field/Field.h" -#include "cuda/GPUField.h" -#include "cuda/FieldCopy.h" -#include "cuda/AddGPUFieldToStorage.h" +#include "gpu/GPUField.h" +#include "gpu/FieldCopy.h" +#include "gpu/AddGPUFieldToStorage.h" #include "MicroBenchmarkCopyKernel.h" #include "MicroBenchmarkStreamKernel.h" @@ -44,10 +44,10 @@ int main( int argc, char **argv ) shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u, 128u, 128u, 128u, 1.0, false, false, false, false); - BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<real_t> >(blocks, "src", 19, field::fzyx, 1); - BlockDataID dstID = cuda::addGPUFieldToStorage<cuda::GPUField<real_t> >(blocks, "dst", 19, field::fzyx, 1); + BlockDataID srcID = gpu::addGPUFieldToStorage< gpu::GPUField<real_t> >(blocks, "src", 19, field::fzyx, 1); + BlockDataID 
dstID = gpu::addGPUFieldToStorage< gpu::GPUField<real_t> >(blocks, "dst", 19, field::fzyx, 1); - int iterations = 3; + int const iterations = 3; pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID); for( int i=0 ; i < iterations; ++i ) @@ -60,7 +60,7 @@ int main( int argc, char **argv ) for( auto &block: *blocks ) stream( &block ); - WALBERLA_CUDA_CHECK(cudaDeviceSynchronize()) + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) return EXIT_SUCCESS; } diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/gpu/codegen/MicroBenchmarkGpuLbm.py similarity index 100% rename from tests/cuda/codegen/MicroBenchmarkGpuLbm.py rename to tests/gpu/codegen/MicroBenchmarkGpuLbm.py diff --git a/tests/gpu/communication/CommTest.cpp b/tests/gpu/communication/CommTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5bc87aa13f9a2c72351b532fb20e3614cdfc8d82 --- /dev/null +++ b/tests/gpu/communication/CommTest.cpp @@ -0,0 +1,250 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file +//! 
\author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#include "core/Environment.h" +#include "core/debug/TestSubsystem.h" +#include "core/mpi/Datatype.h" + +#include "field/Field.h" +#include "field/communication/MPIDatatypes.h" + +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" + +#define NUM_ITER 100 +#define SIZE_X 16 +#define SIZE_Y 16 +#define SIZE_Z 16 +#define LAYOUT field::fzyx + +using namespace walberla; + +void hostToHost() +{ + Field< double, 1 > const hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0, LAYOUT); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + hostField2.set(hostField1); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void hostToDevice() +{ + Field< double, 1 > const hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + gpu::fieldCpy(deviceField, hostField); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void deviceToHost() +{ + Field< double, 1 > hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + gpu::fieldCpy(deviceField, hostField); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + gpu::fieldCpy(hostField, deviceField); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiHostToHost() +{ + Field< double, 1 > hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); 
+ + auto hostDatatype1 = mpi::Datatype(field::communication::mpiDatatype(hostField1)); + auto hostDatatype2 = mpi::Datatype(field::communication::mpiDatatype(hostField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiHostToDevice() +{ + Field< double, 1 > hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype = mpi::Datatype(field::communication::mpiDatatype(hostField)); + auto deviceDatatype = mpi::Datatype(field::communication::mpiDatatype(deviceField)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiDeviceToHost() +{ + Field< double, 1 > hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype = mpi::Datatype(field::communication::mpiDatatype(hostField)); + auto deviceDatatype = mpi::Datatype(field::communication::mpiDatatype(deviceField)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + 
MPI_Isend(deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiDeviceToDevice() +{ + gpu::GPUField< double > deviceField1(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + gpu::GPUField< double > deviceField2(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto deviceDatatype1 = mpi::Datatype(field::communication::mpiDatatype(deviceField1)); + auto deviceDatatype2 = mpi::Datatype(field::communication::mpiDatatype(deviceField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(deviceField1.data(), 1, deviceDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(deviceField2.data(), 1, deviceDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiCopyHostToDevice() +{ + Field< double, 1 > hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + gpu::GPUField< double > deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype(field::communication::mpiDatatype(hostField1)); + auto hostDatatype2 = mpi::Datatype(field::communication::mpiDatatype(hostField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request1; + MPI_Isend(hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Request request2; + MPI_Irecv(hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + 
MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + + gpu::fieldCpy(deviceField, hostField2); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiCopyDeviceToHost() +{ + Field< double, 1 > hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field< double, 1 > hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + gpu::GPUField< double > const deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype(field::communication::mpiDatatype(hostField1)); + auto hostDatatype2 = mpi::Datatype(field::communication::mpiDatatype(hostField2)); + + double const startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) + { + MPI_Request request2; + MPI_Irecv(hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2); + + gpu::fieldCpy(hostField1, deviceField); + + MPI_Request request1; + MPI_Isend(hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1); + + MPI_Wait(&request1, MPI_STATUS_IGNORE); + MPI_Wait(&request2, MPI_STATUS_IGNORE); + } + double const endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +int main(int argc, char** argv) +{ + debug::enterTestMode(); + walberla::Environment const walberlaEnv(argc, argv); + + WALBERLA_CHECK_EQUAL(MPIManager::instance()->numProcesses(), 2) + + hostToHost(); + hostToDevice(); + deviceToHost(); + mpiHostToHost(); + mpiHostToDevice(); + mpiDeviceToHost(); + mpiDeviceToDevice(); + mpiCopyHostToDevice(); + mpiCopyDeviceToHost(); + + return EXIT_SUCCESS; +} diff --git a/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp b/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp similarity index 61% rename from tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp rename to tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp index 
d4f5f140551ffaf9a995ab0dfe257fb89a19a188..3e79d6263b5feb28a11cc15d6206ba109cd3df5e 100644 --- a/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp +++ b/tests/gpu/communication/GPUBlockSelectorCommunicationTest.cpp @@ -14,7 +14,7 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUBlockSelectorCommunicationTest.cpp -//! \ingroup cuda +//! \ingroup gpu //! \author Helen Schottenhamml <helen.schottenhamml@fau.de> //! \brief Short communication test for the usage of block selectors in UniformGPUScheme. // @@ -23,24 +23,26 @@ #include <blockforest/GlobalLoadBalancing.h> #include <blockforest/Initialization.h> #include <blockforest/SetupBlockForest.h> + #include <core/DataTypes.h> +#include <core/Environment.h> #include <core/debug/TestSubsystem.h> #include <core/math/Random.h> -#include <core/Environment.h> -#include <cuda/AddGPUFieldToStorage.h> -#include <cuda/ErrorChecking.h> -#include <cuda/FieldCopy.h> -#include <cuda/GPUField.h> -#include <cuda/communication/MemcpyPackInfo.h> -#include <cuda/communication/UniformGPUScheme.h> -#include <cuda_runtime.h> + #include <domain_decomposition/BlockDataID.h> + #include <field/AddToStorage.h> #include <field/GhostLayerField.h> + +#include "gpu/GPUWrapper.h" +#include <gpu/AddGPUFieldToStorage.h> +#include <gpu/FieldCopy.h> +#include <gpu/GPUField.h> +#include <gpu/communication/MemcpyPackInfo.h> +#include <gpu/communication/UniformGPUScheme.h> + #include <stencil/D3Q27.h> #include <stencil/Directions.h> -#include <stencil/Iterator.h> -#include <vector> namespace walberla { @@ -48,20 +50,18 @@ using Type_T = int; using Stencil_T = stencil::D3Q27; using ScalarField_T = field::GhostLayerField< Type_T, 1 >; -using GPUScalarField_T = cuda::GPUField< Type_T >; +using GPUScalarField_T = gpu::GPUField< Type_T >; const Set< SUID > requiredBlockSelector("communication"); const Set< SUID > incompatibleBlockSelector("no communication"); -void suidAssignmentFunction( 
blockforest::SetupBlockForest & forest ) { - - for( auto & sblock : forest ) { - if( forest.atDomainXMinBorder( sblock ) ) { - sblock.addState(incompatibleBlockSelector); - } else { - sblock.addState(requiredBlockSelector); - } - sblock.setWorkload(walberla::numeric_cast<walberla::workload_t>(1)); +void suidAssignmentFunction(blockforest::SetupBlockForest& forest) +{ + for (auto& sblock : forest) + { + if (forest.atDomainXMinBorder(sblock)) { sblock.addState(incompatibleBlockSelector); } + else { sblock.addState(requiredBlockSelector); } + sblock.setWorkload(walberla::numeric_cast< walberla::workload_t >(1)); } } @@ -70,13 +70,9 @@ void initScalarField(std::shared_ptr< StructuredBlockForest >& blocks, const Blo for (auto& block : *blocks) { Type_T val; - if (blocks->atDomainXMinBorder(block)) { - val = Type_T(-1); - } else if (blocks->atDomainXMaxBorder(block)) { - val = Type_T(1); - } else { - val = Type_T(0); - } + if (blocks->atDomainXMinBorder(block)) { val = Type_T(-1); } + else if (blocks->atDomainXMaxBorder(block)) { val = Type_T(1); } + else { val = Type_T(0); } auto* field = block.getData< ScalarField_T >(fieldID); WALBERLA_ASSERT_NOT_NULLPTR(field) @@ -90,12 +86,11 @@ void initScalarField(std::shared_ptr< StructuredBlockForest >& blocks, const Blo } } -std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( - const uint_t numberOfXBlocks, const uint_t numberOfYBlocks, const uint_t numberOfZBlocks, - const uint_t numberOfXCellsPerBlock, const uint_t numberOfYCellsPerBlock, const uint_t numberOfZCellsPerBlock, - const real_t dx, - const bool xPeriodic, const bool yPeriodic, const bool zPeriodic, - const bool keepGlobalBlockInformation ) +std::shared_ptr< StructuredBlockForest > + createSelectorBlockGrid(const uint_t numberOfXBlocks, const uint_t numberOfYBlocks, const uint_t numberOfZBlocks, + const uint_t numberOfXCellsPerBlock, const uint_t numberOfYCellsPerBlock, + const uint_t numberOfZCellsPerBlock, const real_t dx, const bool xPeriodic, + 
const bool yPeriodic, const bool zPeriodic, const bool keepGlobalBlockInformation) { // initialize SetupBlockForest = determine domain decomposition @@ -103,20 +98,22 @@ std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( sforest.addWorkloadMemorySUIDAssignmentFunction(suidAssignmentFunction); - AABB domainAABB{ real_c(0), real_c(0), real_c(0), - dx * real_c( numberOfXBlocks * numberOfXCellsPerBlock ), - dx * real_c( numberOfYBlocks * numberOfYCellsPerBlock ), - dx * real_c( numberOfZBlocks * numberOfZCellsPerBlock ) }; + AABB const domainAABB{ real_c(0), + real_c(0), + real_c(0), + dx * real_c(numberOfXBlocks * numberOfXCellsPerBlock), + dx * real_c(numberOfYBlocks * numberOfYCellsPerBlock), + dx * real_c(numberOfZBlocks * numberOfZCellsPerBlock) }; sforest.init(domainAABB, numberOfXBlocks, numberOfYBlocks, numberOfZBlocks, xPeriodic, yPeriodic, zPeriodic); // calculate process distribution const memory_t memoryLimit = numeric_cast< memory_t >(sforest.getNumberOfBlocks()); - blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig( + blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > const metisConfig( true, false, - std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, numberOfXCellsPerBlock, - numberOfYCellsPerBlock, numberOfZCellsPerBlock)); + std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, + numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock)); sforest.calculateProcessDistribution_Default(uint_c(MPIManager::instance()->numProcesses()), memoryLimit, "hilbert", 10, false, metisConfig); @@ -138,40 +135,44 @@ std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid ( int main(int argc, char** argv) { debug::enterTestMode(); - walberla::Environment walberlaEnv(argc, argv); + walberla::Environment const walberlaEnv(argc, argv); - const Vector3<uint_t> nBlocks { 3, 1, 1 }; - const 
Vector3<uint_t> cells { 2, 2, 1 }; - Vector3<real_t> domainSize; - for( uint_t d = 0; d < 3; ++d ) { + const Vector3< uint_t > nBlocks{ 3, 1, 1 }; + const Vector3< uint_t > cells{ 2, 2, 1 }; + Vector3< real_t > domainSize; + for (uint_t d = 0; d < 3; ++d) + { domainSize[d] = real_c(cells[d] * nBlocks[d]); } - auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2], - cells[0], cells[1], cells[2], 1, false, true, true, true); + auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2], cells[0], cells[1], cells[2], 1, false, + true, true, true); - BlockDataID fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1)); + BlockDataID const fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1)); initScalarField(blocks, fieldID); - BlockDataID gpuFieldID = cuda::addGPUFieldToStorage< ScalarField_T >(blocks, fieldID, "GPU scalar"); + BlockDataID const gpuFieldID = gpu::addGPUFieldToStorage< ScalarField_T >(blocks, fieldID, "GPU scalar"); // Setup communication schemes for GPUPackInfo - cuda::communication::UniformGPUScheme< Stencil_T > communication(blocks, requiredBlockSelector, incompatibleBlockSelector); - communication.addPackInfo(std::make_shared< cuda::communication::MemcpyPackInfo< GPUScalarField_T > >(gpuFieldID)); + gpu::communication::UniformGPUScheme< Stencil_T > communication(blocks, requiredBlockSelector, incompatibleBlockSelector); + communication.addPackInfo(std::make_shared< gpu::communication::MemcpyPackInfo< GPUScalarField_T > >(gpuFieldID)); // Perform one communication step communication(); + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Copy to CPU - cuda::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID ); + gpu::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID ); // Check for correct data in ghost layers of middle block - auto middleBlock = blocks->getBlock( domainSize[0] / 
real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2) ); - auto cpuField = middleBlock->getData<ScalarField_T>(fieldID); + auto middleBlock = blocks->getBlock(domainSize[0] / real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2)); + auto cpuField = middleBlock->getData< ScalarField_T >(fieldID); WALBERLA_ASSERT_NOT_NULLPTR(cpuField) - + // avoid unused variable warning in release mode (void) cpuField; + // WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(cpuField, WALBERLA_LOG_DEVEL_VAR(cpuField->get(x, y, z))) // check for missing communication with left neighbour (first block, incompatible selector) WALBERLA_ASSERT_EQUAL(cpuField->get(-1, 0, 0), 0, "Communication with left neighbor detected.") diff --git a/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp b/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f0e41c1081e306cad53a5d0e3f04187acbb18b95 --- /dev/null +++ b/tests/gpu/communication/GPUPackInfoCommunicationTest.cpp @@ -0,0 +1,165 @@ +//======================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUFieldPackInfoTest.cpp +//! \ingroup gpu +//! 
\author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> +//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple +//! streams. +// +//======================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "core/DataTypes.h" +#include "core/debug/TestSubsystem.h" +#include "core/math/Random.h" +#include "core/mpi/Environment.h" + +#include "domain_decomposition/BlockDataID.h" + +#include "field/GhostLayerField.h" + +#include "stencil/D3Q27.h" +#include "stencil/Directions.h" +#include "stencil/Iterator.h" + +#include <vector> + +#include "gpu/ErrorChecking.h" +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" +#include "gpu/HostFieldAllocator.h" +#include "gpu/communication/GPUPackInfo.h" + +using namespace walberla; + +using DataType = walberla::uint_t; +using StencilType = stencil::D3Q27; +using FieldType = field::GhostLayerField< DataType, StencilType::Size >; +using GPUFieldType = gpu::GPUField< DataType >; +using CommSchemeType = blockforest::communication::UniformBufferedScheme< StencilType >; +using GPUPackInfoType = gpu::communication::GPUPackInfo< GPUFieldType >; + +static std::vector< gpu::Layout > fieldLayouts = { gpu::fzyx, gpu::zyxf }; +static uint_t fieldLayoutIndex = 0; + +FieldType* createField(IBlock* const block, StructuredBlockStorage* const storage) +{ + return new FieldType(storage->getNumberOfXCells(*block), // number of cells in x direction per block + storage->getNumberOfYCells(*block), // number of cells in y direction per block + storage->getNumberOfZCells(*block), // number of cells in z direction per block + 1, // one ghost layer + DataType(0), // initial value + fieldLayouts[fieldLayoutIndex], // layout + make_shared< gpu::HostFieldAllocator< DataType > >() // allocator for host pinned memory + ); +} + 
+GPUFieldType* createGPUField(IBlock* const block, StructuredBlockStorage* const storage) +{ + return new GPUFieldType(storage->getNumberOfXCells(*block), // number of cells in x direction + storage->getNumberOfYCells(*block), // number of cells in y direction + storage->getNumberOfZCells(*block), // number of cells in z direction + StencilType::Size, // number of cells for pdfs + 1, // one ghost layer + fieldLayouts[fieldLayoutIndex]); +} + +void initFields(const shared_ptr< StructuredBlockStorage >& blocks, const BlockDataID& fieldID) +{ + for (auto block = blocks->begin(); block != blocks->end(); ++block) + { + auto fieldPtr = block->getData< FieldType >(fieldID); + + for (auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt) + *fieldIt = math::intRandom< DataType >(); + } +} + +int main(int argc, char** argv) +{ + debug::enterTestMode(); + mpi::Environment mpiEnv(argc, argv); + + const Vector3< uint_t > cells = Vector3< uint_t >(4, 4, 4); + + uint_t nProc = uint_c(MPIManager::instance()->numProcesses()); + + for (; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex) + { + auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks + cells[0], cells[1], cells[2], // cells + 1, // unit cell spacing + true, // one block per process + true, true, true); // periodic in all directions + + BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >(&createField, "ScalarField"); + + BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >(&createGPUField, "syncGPUField"); + + BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >(&createGPUField, "asyncGPUField"); + + math::seedRandomGenerator(numeric_cast< std::mt19937::result_type >(MPIManager::instance()->rank())); + // Initialize CPU field with random values + initFields(blocks, sourceFieldId); + + // Copy same CPU field to both GPU fields + for (auto block = blocks->begin(); block != blocks->end(); ++block) + { + auto 
sourceFieldPtr = block->getData< FieldType >(sourceFieldId); + + auto syncGPUFieldPtr = block->getData< GPUFieldType >(syncGPUFieldId); + gpu::fieldCpy(*syncGPUFieldPtr, *sourceFieldPtr); + + auto asyncGPUFieldPtr = block->getData< GPUFieldType >(asyncGPUFieldId); + gpu::fieldCpy(*asyncGPUFieldPtr, *sourceFieldPtr); + } + + // Setup communication schemes for synchronous GPUPackInfo + CommSchemeType syncCommScheme(blocks); + syncCommScheme.addPackInfo(make_shared< GPUPackInfoType >(syncGPUFieldId)); + + // Setup communication scheme for asynchronous GPUPackInfo, which uses GPU streams + CommSchemeType asyncCommScheme(blocks); + asyncCommScheme.addPackInfo(make_shared< GPUPackInfoType >(asyncGPUFieldId)); + + // Perform one communication step for each scheme + syncCommScheme(); + asyncCommScheme(); + + // Check results + FieldType syncFieldCpu(cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], + make_shared< gpu::HostFieldAllocator< DataType > >()); + FieldType asyncFieldCpu(cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], + make_shared< gpu::HostFieldAllocator< DataType > >()); + + for (auto block = blocks->begin(); block != blocks->end(); ++block) + { + auto syncGPUFieldPtr = block->getData< GPUFieldType >(syncGPUFieldId); + gpu::fieldCpy(syncFieldCpu, *syncGPUFieldPtr); + + auto asyncGPUFieldPtr = block->getData< GPUFieldType >(asyncGPUFieldId); + gpu::fieldCpy(asyncFieldCpu, *asyncGPUFieldPtr); + + for (auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ(); + syncIt != syncFieldCpu.end(); ++syncIt, ++asyncIt) + WALBERLA_CHECK_EQUAL(*syncIt, *asyncIt) + } + } + + return EXIT_SUCCESS; +} diff --git a/tests/gpu/communication/GPUPackInfoTest.cpp b/tests/gpu/communication/GPUPackInfoTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e0a9d87fd06f7d261b09942c7d69bed189e60177 --- /dev/null +++ b/tests/gpu/communication/GPUPackInfoTest.cpp @@ -0,0 +1,177 @@ 
+//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUFieldPackInfoTest.cpp +//! \ingroup gpu +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \brief Tests if a GPUField is correctly packed into buffers +// +//====================================================================================================================== + +#include "gpu/communication/GPUPackInfo.h" + +#include "blockforest/Initialization.h" + +#include "core/debug/TestSubsystem.h" +#include "core/mpi/MPIManager.h" + +#include "field/GhostLayerField.h" + +#include "stencil/D3Q27.h" + +#include <cstring> +#include <vector> + +#include "gpu/FieldCopy.h" +#include "gpu/GPUField.h" + +#define F_SIZE 19 + +using namespace walberla; + +static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf }; +static uint_t fieldLayoutIndex = 0; + +gpu::GPUField< int >* createGPUField(IBlock* const block, StructuredBlockStorage* const storage) +{ + return new gpu::GPUField< int >(storage->getNumberOfXCells(*block), // number of cells in x direction + storage->getNumberOfYCells(*block), // number of cells in y direction + storage->getNumberOfZCells(*block), // number of cells in z direction + F_SIZE, // fSize + 1, // 
number of ghost layers + fieldLayouts[fieldLayoutIndex]); +} + +// Tester base class. The communicate() template method allows testing different communication methods. +class GPUPackInfoTester +{ + public: + using GPUPackInfoType = gpu::communication::GPUPackInfo< gpu::GPUField< int > >; + + GPUPackInfoTester(IBlock* block, BlockDataID fieldId) : block_(block), fieldId_(fieldId) {} + + virtual ~GPUPackInfoTester() = default; + + void test(stencil::Direction dir) + { + gpu::GPUField< int >& gpuField = *(block_->getData< gpu::GPUField< int > >(fieldId_)); + + field::GhostLayerField< int, F_SIZE > cpuField(gpuField.xSize(), // number of cells in x direction + gpuField.ySize(), // number of cells in y direction + gpuField.zSize(), // number of cells in z direction + 1, // number of ghost layers + 0, // initial value + fieldLayouts[fieldLayoutIndex]); + cpuField.setWithGhostLayer(0); + + int val = 0; + for (auto it = cpuField.beginSliceBeforeGhostLayer(dir); it != cpuField.end(); ++it) + { + *it = ++val; + } + gpu::fieldCpy(gpuField, cpuField); + + GPUPackInfoType gpuPackInfo(fieldId_); + + communicate(gpuPackInfo, dir); + gpu::fieldCpy(cpuField, gpuField); + + val = 0; + for (auto it = cpuField.beginGhostLayerOnly(stencil::inverseDir[dir]); it != cpuField.end(); ++it) + { + WALBERLA_CHECK_EQUAL(*it, ++val) + } + } + + protected: + virtual void communicate(GPUPackInfoType& gpuPackInfo, stencil::Direction dir) = 0; + + IBlock* block_; + BlockDataID fieldId_; +}; + +// Tester for buffer communication +class GPUPackInfoBufferTester : public GPUPackInfoTester +{ + public: + GPUPackInfoBufferTester(IBlock* block, BlockDataID fieldId) : GPUPackInfoTester(block, fieldId) {} + + protected: + void communicate(GPUPackInfoType& gpuPackInfo, stencil::Direction dir) override + { + mpi::GenericSendBuffer<> sendBuf; + sendBuf.addDebugMarker("Be"); + gpuPackInfo.packData(block_, dir, sendBuf); + sendBuf.addDebugMarker("Af"); + + // Manually copy over the send to the receive buffer + 
mpi::GenericRecvBuffer<> recvBuf; + recvBuf.resize(sendBuf.size()); + memcpy(recvBuf.ptr(), sendBuf.ptr(), sendBuf.size() * sizeof(mpi::GenericSendBuffer<>::ElementType)); + + recvBuf.readDebugMarker("Be"); + gpuPackInfo.unpackData(block_, stencil::inverseDir[dir], recvBuf); + recvBuf.readDebugMarker("Af"); + } +}; + +// Tester for local communication +class GPUPackInfoLocalTester : public GPUPackInfoTester +{ + public: + GPUPackInfoLocalTester(IBlock* block, BlockDataID fieldId) : GPUPackInfoTester(block, fieldId) {} + + protected: + void communicate(GPUPackInfoType& gpuPackInfo, stencil::Direction dir) override + { + gpuPackInfo.communicateLocal(block_, block_, dir); + } +}; + +int main(int argc, char** argv) +{ + using blockforest::createUniformBlockGrid; + + debug::enterTestMode(); + MPIManager::instance()->initializeMPI(&argc, &argv); + + for (; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex) + { + // Create BlockForest + uint_t const processes = uint_c(MPIManager::instance()->numProcesses()); + auto blocks = createUniformBlockGrid(processes, 1, 1, // blocks + 2, 2, 2, // cells + 1, // dx + false, // one block per process + true, true, true); // periodicity + + BlockDataID const scalarGPUFieldId = + blocks->addStructuredBlockData< gpu::GPUField< int > >(&createGPUField, "ScalarGPUField"); + + for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) + { + GPUPackInfoBufferTester bufferTester(&(*blockIt), scalarGPUFieldId); + GPUPackInfoLocalTester localTester(&(*blockIt), scalarGPUFieldId); + + for (auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir) + { + localTester.test(*dir); + bufferTester.test(*dir); + } + } + } + + return EXIT_SUCCESS; +} diff --git a/tests/gui/SimpleGuiRun.cpp b/tests/gui/SimpleGuiRun.cpp index cb8d39af582fadb2b05bf8dcfaac1bda22dad72f..36ca045c43a9cecee878925fb5f5be0aec0cc73d 100644 --- a/tests/gui/SimpleGuiRun.cpp +++ b/tests/gui/SimpleGuiRun.cpp @@ -73,8 +73,8 @@ int main(int 
argc, char **argv ) BlockDataID pdfField = lbm::addPdfFieldToStorage( blocks, "PdfField", latticeModel ); - BlockDataID scalarField1 = field::addToStorage<ScalarField>( blocks, "ScalarFieldOneGl", real_t(0), field::zyxf, 1 ); - BlockDataID scalarField2 = field::addToStorage<ScalarField>( blocks, "ScalarFieldTwoGl", real_t(0), field::zyxf, 2 ); + BlockDataID scalarField1 = field::addToStorage<ScalarField>( blocks, "ScalarFieldOneGl", real_t(0), field::fzyx, 1 ); + BlockDataID scalarField2 = field::addToStorage<ScalarField>( blocks, "ScalarFieldTwoGl", real_t(0), field::fzyx, 2 ); BlockDataID vectorField = field::addToStorage<VectorField>( blocks, "VectorField", Vector3<real_t>(0,0,0) ); BlockDataID flagField = field::addFlagFieldToStorage<FField>( blocks, "FlagField" ); diff --git a/tests/lbm/codegen/GeneratedOutflowBC.cpp b/tests/lbm/codegen/GeneratedOutflowBC.cpp index 453a1e7cc58a23c106f565ec28a7b411a23c2370..3226802ae74d94843d9ed9199c5205acbb0602c2 100644 --- a/tests/lbm/codegen/GeneratedOutflowBC.cpp +++ b/tests/lbm/codegen/GeneratedOutflowBC.cpp @@ -70,12 +70,12 @@ Vector3< real_t > ShearProfile::operator()( const Cell& pos, const shared_ptr< S { Cell globalCell; CellInterval domain = SbF->getDomainCellBB(); - real_t h_y = real_c(domain.yMax()) - real_c(domain.yMin()); + real_t const h_y = real_c(domain.yMax()) - real_c(domain.yMin()); SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); - real_t u = inflow_velocity_ * (real_c(globalCell[1]) / h_y); + real_t const u = inflow_velocity_ * (real_c(globalCell[1]) / h_y); - Vector3< real_t > result(u, 0.0, 0.0); + Vector3< real_t > const result(u, 0.0, 0.0); return result; } @@ -96,15 +96,12 @@ int main(int argc, char** argv) const real_t u_max = parameters.getParameter< real_t >("u_max", real_c(0.05)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); 
// in seconds - // create fields BlockDataID pdfFieldID = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs"); BlockDataID velFieldID = field::addToStorage< VelocityField_T >(blocks, "velocity", real_c(0.0), field::fzyx); - BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); + BlockDataID const densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); - BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); pystencils::GeneratedOutflowBC_MacroSetter setterSweep(pdfFieldID, velFieldID); for (auto& block : *blocks) @@ -115,7 +112,7 @@ int main(int argc, char** argv) auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries"); - ShearProfile velocityCallback{u_max}; + ShearProfile const velocityCallback{u_max}; std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > velocity_initialisation = velocityCallback; @@ -148,12 +145,8 @@ int main(int argc, char** argv) timeloop.add() << Sweep(outflow, "outflow boundary"); timeloop.add() << Sweep(UpdateSweep, "LB stream & collide"); - // log remaining time - timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), - "remaining time logger"); - // VTK Writer - uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); if (vtkWriteFrequency > 0) { auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "GeneratedOutflowBC_VTK", vtkWriteFrequency, 0, false, @@ -170,7 +163,7 @@ int main(int argc, char** argv) timeloop.run(); CellInterval domain = blocks->getDomainCellBB(); - real_t h_y = real_c(domain.yMax()) - real_c(domain.yMin()); + real_t 
const h_y = real_c(domain.yMax()) - real_c(domain.yMin()); for (auto& block : *blocks) { auto velField = block.getData<VelocityField_T>(velFieldID); diff --git a/tests/lbm/codegen/LbCodeGenerationExample.cpp b/tests/lbm/codegen/LbCodeGenerationExample.cpp index 4711fb1b91a024bde929ed19b44fca2dd9e018c0..99087e897c3b9db25ac7524cf95cd6e12a666ec1 100644 --- a/tests/lbm/codegen/LbCodeGenerationExample.cpp +++ b/tests/lbm/codegen/LbCodeGenerationExample.cpp @@ -68,8 +68,8 @@ int main(int argc, char** argv) parameters.getParameter< Vector3< real_t > >("initialVelocity", Vector3< real_t >()); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); - const double remainingTimeLoggerFrequency = - parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + const real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds // create fields BlockDataID forceFieldId = field::addToStorage< VectorField_T >(blocks, "Force", real_c(0.0), field::fzyx); diff --git a/tests/lbm/diff_packinfos.sh b/tests/lbm/diff_packinfos.sh index bfa89c5ef63477c61fefac60b7767fe22aaf4233..074d31492dbc1dd2cc0f47bc059ab5d181117f22 100755 --- a/tests/lbm/diff_packinfos.sh +++ b/tests/lbm/diff_packinfos.sh @@ -2,5 +2,5 @@ REGEX='^((#include)|(void)|(uint_t))' cd default_codegen -diff -u -B <(grep -vP "$REGEX" FromKernelPackInfoPull.cpp) <(grep -vP "$REGEX" AccessorBasedPackInfoEven.cpp) || exit 1 -diff -u -B <(grep -vP "$REGEX" FromKernelPackInfoPush.cpp) <(grep -vP "$REGEX" AccessorBasedPackInfoOdd.cpp) || exit 1 +diff -u -B <(tail -n +20 FromKernelPackInfoPull.cpp | grep -vP "$REGEX") <(tail -n +20 AccessorBasedPackInfoEven.cpp | grep -vP "$REGEX") || exit 1 +diff -u -B <(tail -n +20 FromKernelPackInfoPush.cpp | grep -vP "$REGEX") <(tail -n +20 AccessorBasedPackInfoOdd.cpp | grep -vP "$REGEX") || exit 1 diff --git a/tests/lbm/field/QCriterionTest.cpp 
b/tests/lbm/field/QCriterionTest.cpp index 0e5dc2dd042ccd78ee0e983b94f525903c28f1b8..966b0e67882241dd00ee6d4c566e3cc2f82d4f4c 100644 --- a/tests/lbm/field/QCriterionTest.cpp +++ b/tests/lbm/field/QCriterionTest.cpp @@ -54,7 +54,7 @@ int main( int argc, char ** argv ) auto numberOfCells = uint_t(40); - VelocityField_T velocityField(numberOfCells, numberOfCells, numberOfCells, uint_t(1)); + VelocityField_T velocityField(numberOfCells, numberOfCells, numberOfCells, uint_t(1), field::zyxf); FluidFilter_T filter(numberOfCells); diff --git a/tests/lbm/refinement/Uniformity.cpp b/tests/lbm/refinement/Uniformity.cpp index 7ff59cbbb954c4444971e72ddcbfa1fae9162e9b..dbad16dcba2a6f16c6a2525be82b39d9709e0fb2 100644 --- a/tests/lbm/refinement/Uniformity.cpp +++ b/tests/lbm/refinement/Uniformity.cpp @@ -394,7 +394,7 @@ int main( int argc, char ** argv ) // check constant velocity //typedef GhostLayerField<real_t,1> ErrorField; - //BlockDataID errorFieldId = field::addToStorage< ErrorField >( blocks, "error field", real_t(0), field::zyxf, FieldGhostLayers ); + //BlockDataID errorFieldId = field::addToStorage< ErrorField >( blocks, "error field", real_t(0), field::fzyx, FieldGhostLayers ); for( auto block = blocks->begin(); block != blocks->end(); ++block ) { diff --git a/tests/lbm_generated/CMakeLists.txt b/tests/lbm_generated/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7a7ef76bd6b9a605d19cd7e2bbdf4bedddbed7b --- /dev/null +++ b/tests/lbm_generated/CMakeLists.txt @@ -0,0 +1,21 @@ +############################################################################################################################# +# +# Tests for generated lbm module +# +############################################################################################################################# +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) + +waLBerla_generate_target_from_python(NAME ExampleGenerated + FILE Example.py + 
OUT_FILES LBMStorageSpecification.h LBMStorageSpecification.cpp + LBMSweepCollection.h LBMSweepCollection.cpp + NoSlip.h NoSlip.cpp + UBB.h UBB.cpp + LBMBoundaryCollection.h + Example_InfoHeader.h) +waLBerla_compile_test( FILES Example.cpp DEPENDS ExampleGenerated blockforest field lbm_generated timeloop ) + +if( WALBERLA_DOUBLE_ACCURACY ) +waLBerla_compile_test( FILES LDC.cpp DEPENDS blockforest field lbm_generated timeloop ) +endif() diff --git a/tests/lbm_generated/Example.cpp b/tests/lbm_generated/Example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4dfd69b553d88d268efb0c49c857eb391f6277ea --- /dev/null +++ b/tests/lbm_generated/Example.cpp @@ -0,0 +1,233 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Example.cpp +//! 
\author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/all.h" + +#include "core/all.h" + +#include "domain_decomposition/all.h" + +#include "field/all.h" + +#include "geometry/all.h" + +#include "timeloop/all.h" + +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" + +// include the generated header file. It includes all generated classes +#include "Example_InfoHeader.h" + +using namespace walberla; +using namespace std::placeholders; + +using StorageSpecification_T = lbm::LBMStorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; +using PackInfo_T = lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >; + +using SweepCollection_T = lbm::LBMSweepCollection; + +using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >; +using ScalarField_T = GhostLayerField< real_t, 1 >; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField< flag_t >; +using BoundaryCollection_T = lbm::LBMBoundaryCollection< FlagField_T >; + +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + +class LDCRefinement +{ + public: + LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) + { + std::vector< SetupBlock* > blocks; + forest.getBlocks(blocks); + + for (auto b : blocks) + { + if (forest.atDomainZMaxBorder(*b)) + { + if (b->getLevel() < refinementDepth_) { b->setMarker(true); } + } + } + } + + private: + const uint_t 
refinementDepth_; +}; + +class LDC +{ + public: + LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } + + RefinementSelectionFunctor refinementSelector() { return LDCRefinement(refinementDepth_); } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + Block& b = dynamic_cast< Block& >(*bIt); + uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } + + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; +}; + +static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& setup) +{ + Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); + Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); + Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + + auto refSelection = setup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), 
domainSize[0], domainSize[1], domainSize[2]); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), uint_c(MPIManager::instance()->numProcesses())); +} + +int main(int argc, char** argv) +{ + walberla::Environment walberlaEnv(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + // read parameters + auto domainSetup = walberlaEnv.config()->getOneBlock("DomainSetup"); + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + + auto omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + auto timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1); + auto refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); + + auto remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + + auto flowSetup = std::make_shared< LDC >(refinementDepth); + + SetupBlockForest setupBfs; + WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") + createSetupBlockForest(setupBfs, domainSetup, *flowSetup); + // domainSetup + + // Create structured block forest + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); + + WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); + blocks->createCellBoundingBoxes(); + + WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } + + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << 
setupBfs.getNumberOfBlocks(level)) + } + + StorageSpecification_T StorageSpec = StorageSpecification_T(); + BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(2)); + BlockDataID velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx); + + BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(3)); + + SweepCollection_T sweepCollection(blocks, pdfFieldId, velFieldId, omega); + for (auto& block : *blocks) + { + sweepCollection.initialise(&block); + } + + const FlagUID fluidFlagUID("Fluid"); + flowSetup->setupBoundaryFlagField(*blocks, flagFieldId); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID, 2); + BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID); + + WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...") + auto comm = + std::make_shared< blockforest::communication::NonUniformBufferedScheme< CommunicationStencil_T > >(blocks); + auto packInfo = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldId); + comm->addPackInfo(packInfo); + + lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > timestep( + blocks, pdfFieldId, sweepCollection, boundaryCollection, comm, packInfo); + + SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "ExampleVTK", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity"); + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + } + }); + + 
vtkOutput->addCellDataWriter(velWriter); + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + timeloop.addFuncAfterTimeStep(timestep); + + // log remaining time + timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + + WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation with " << timesteps << " timesteps") + + timeloop.run(); + + return EXIT_SUCCESS; +} diff --git a/tests/lbm_generated/Example.prm b/tests/lbm_generated/Example.prm new file mode 100644 index 0000000000000000000000000000000000000000..1957b362e4b77a94fb3d3c68b6e9d33b0efb3e6f --- /dev/null +++ b/tests/lbm_generated/Example.prm @@ -0,0 +1,30 @@ + +Parameters +{ + omega 1.95; + timesteps 3000; + refinementDepth 1; + + remainingTimeLoggerFrequency 3; // in seconds + vtkWriteFrequency 500; +} + +DomainSetup +{ + domainSize <64, 64, 64>; + rootBlocks <4, 4, 4>; + + cellsPerBlock < 16, 16, 16 >; + periodic < 0, 1, 0 >; +} + +Boundaries +{ + + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } + Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag UBB; } + Border { direction T; walldistance -1; flag NoSlip; } + Border { direction B; walldistance -1; flag NoSlip; } +} diff --git a/tests/lbm_generated/Example.py b/tests/lbm_generated/Example.py new file mode 100644 index 0000000000000000000000000000000000000000..5233639be24c6574cee6440300bfe73e22e5e2ae --- /dev/null +++ b/tests/lbm_generated/Example.py @@ -0,0 +1,48 @@ +import sympy as sp + +from pystencils import Target +from pystencils import fields + +from lbmpy.advanced_streaming.utility import get_timesteps +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule +from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil +from pystencils_walberla import 
CodeGeneration, generate_info_header +from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator + +import warnings + +warnings.filterwarnings("ignore") +with CodeGeneration() as ctx: + target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU + data_type = "float64" if ctx.double_accuracy else "float32" + + streaming_pattern = 'esotwist' + timesteps = get_timesteps(streaming_pattern) + + omega = sp.symbols("omega") + + stencil = LBStencil(Stencil.D3Q19) + pdfs, vel_field = fields(f"pdfs({stencil.Q}), velocity({stencil.D}): {data_type}[{stencil.D}D]", layout='fzyx') + + macroscopic_fields = {'velocity': vel_field} + + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + streaming_pattern=streaming_pattern) + lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx') + + method = create_lb_method(lbm_config=lbm_config) + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', + boundary_object=NoSlip()) + ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', + boundary_object=UBB([0.05, 0, 0], data_type=data_type)) + + generate_lbm_package(ctx, name="LBM", + collision_rule=collision_rule, + lbm_config=lbm_config, lbm_optimisation=lbm_opt, + nonuniform=True, boundaries=[no_slip, ubb], + macroscopic_fields=macroscopic_fields) + + generate_info_header(ctx, 'Example_InfoHeader') diff --git a/tests/lbm_generated/LDC.cpp b/tests/lbm_generated/LDC.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6df6d45a3e1cf2915e3b077a83ee77c40668fff7 --- /dev/null +++ b/tests/lbm_generated/LDC.cpp @@ -0,0 +1,136 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file LDC.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/all.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "core/all.h" + +#include "domain_decomposition/all.h" + +#include "field/all.h" +#include "geometry/all.h" +#include "timeloop/all.h" + +#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" + +#include "lbm_generated/storage_specification/D3Q19StorageSpecification.h" +#include "lbm_generated/sweep_collection/D3Q19SRT.h" +#include "lbm_generated/boundary/D3Q19BoundaryCollection.h" + + +using namespace walberla; +using namespace std::placeholders; + +using StorageSpecification_T = lbm::D3Q19StorageSpecification; +using Stencil_T = StorageSpecification_T::Stencil; +using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; +using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; + +using SweepCollection_T = lbm::D3Q19SRT; + +using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >; +using ScalarField_T = GhostLayerField< real_t, 1 >; + +using flag_t = 
walberla::uint8_t; +using FlagField_T = FlagField< flag_t >; +using BoundaryCollection_T = lbm::D3Q19BoundaryCollection< FlagField_T >; + +using blockforest::communication::UniformBufferedScheme; + +int main(int argc, char** argv) +{ + walberla::Environment walberlaEnv(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + // read parameters + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)) + uint_c(1); + + const double remainingTimeLoggerFrequency = + parameters.getParameter< double >("remainingTimeLoggerFrequency", real_c(3.0)); // in seconds + + auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config()); + + StorageSpecification_T const StorageSpec = StorageSpecification_T(); + BlockDataID const pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdf field", StorageSpec, uint_c(1), field::fzyx); + BlockDataID const velFieldId = field::addToStorage< VectorField_T >(blocks, "Velocity", real_c(0.0), field::fzyx); + BlockDataID const densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(0.0), field::fzyx); + BlockDataID const flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field", uint_c(1)); + + const FlagUID fluidFlagUID("Fluid"); + + auto boundariesConfig = walberlaEnv.config()->getBlock("Boundaries"); + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID); + + BoundaryCollection_T boundaryCollection(blocks, flagFieldId, pdfFieldId, fluidFlagUID, real_c(1.0), real_c(0.05), real_c(0.0), real_c(0.0)); + SweepCollection_T sweepCollection(blocks, pdfFieldId, densityFieldId, velFieldId, omega); + + for (auto& block : *blocks) + { + sweepCollection.initialise(&block); + } + + 
auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId); + UniformBufferedScheme< Stencil_T > communication(blocks); + communication.addPackInfo(packInfo); + + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + timeLoop.add() << BeforeFunction(communication, "communication") + << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions"); + timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); + // + auto vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "ExampleVTK", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velFieldId, "velocity"); + auto densWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldId, "density"); + vtkOutput->addBeforeFunction([&](){ + for (auto& block : *blocks) + { + sweepCollection.calculateMacroscopicParameters(&block); + } + }); + + vtkOutput->addCellDataWriter(velWriter); + vtkOutput->addCellDataWriter(densWriter); + + timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + // log remaining time + timeLoop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + + WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation with " << timesteps << " timesteps") + + timeLoop.run(); + + return EXIT_SUCCESS; +} diff --git a/tests/lbm_generated/LDC.prm b/tests/lbm_generated/LDC.prm new file mode 100644 index 0000000000000000000000000000000000000000..4ba435d1b027eee3f9a066e9a9e39aa5e1ec831f --- /dev/null +++ b/tests/lbm_generated/LDC.prm @@ -0,0 +1,28 @@ + +Parameters +{ + omega 1.95; + timesteps 3000; + + remainingTimeLoggerFrequency 3; // in seconds + vtkWriteFrequency 
500; +} + +DomainSetup +{ + Blocks <4, 4, 4>; + cellsPerBlock < 32, 32, 32 >; + + periodic < 0, 1, 0 >; +} + +Boundaries +{ + + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } + Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag UBB; } + Border { direction T; walldistance -1; flag NoSlip; } + Border { direction B; walldistance -1; flag NoSlip; } +} diff --git a/tests/lbm_mesapd_coupling/mapping/ParticleMapping.cpp b/tests/lbm_mesapd_coupling/mapping/ParticleMapping.cpp index b2ecf7107f455b229cde21f9f6c22742c10aec00..c71ce6cf694a90116b91393081baf2e0194fe3df 100644 --- a/tests/lbm_mesapd_coupling/mapping/ParticleMapping.cpp +++ b/tests/lbm_mesapd_coupling/mapping/ParticleMapping.cpp @@ -410,9 +410,9 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3<real_t>(real_t(0)), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphere.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphere.cpp index d6c317521cd63cde70f3c94c237ea23ecf600525..ca976cfc719ca61bfd59d7ec029eb9754329fe1a 100644 --- a/tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphere.cpp +++ b/tests/lbm_mesapd_coupling/momentum_exchange_method/DragForceSphere.cpp @@ -479,15 +479,15 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ), ForceModel_T( Vector3<real_t> ( 
setup.extForce, 0, 0 ) ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp index 6034afaef3ebdb17e66340885e2766a2f979dd68..8956b4988c18b81d812e6386f20b995169535d2b 100644 --- a/tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp +++ b/tests/lbm_mesapd_coupling/momentum_exchange_method/ForceBetweenTwoStationaryObjects.cpp @@ -216,13 +216,13 @@ void createSimulationSetup( shared_ptr< StructuredBlockForest > blocks, shared_p LM_T latticeModel = LM_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LM_T >( blocks, "pdf field (zyxf)", latticeModel, velocity, real_t(1), uint_t(1), field::zyxf ); + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LM_T >( blocks, "pdf field (fzyx)", latticeModel, velocity, real_t(1), uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = 
field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling using BoundaryHandling_T = typename MyBoundaryHandling<LM_T,ParticleAccessor_T>::Type; diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.cpp index a390f6feb62535af2d6edf140244649eecf9ba55..6a60939360ed1afe4fe4d826b571e0432756a08e 100644 --- a/tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.cpp +++ b/tests/lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.cpp @@ -341,9 +341,9 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3<real_t>(real_t(0)), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); @@ -363,7 +363,7 @@ int main( int argc, char **argv ) const real_t overlap = real_t( 1.5 ) * dx; // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", 
accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstruction.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstruction.cpp index 69a47439ead6d2e5cca083629c8343aab8e8c3c4..665ddb12f41259f5025eace600f0d084b49c5aee 100644 --- a/tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstruction.cpp +++ b/tests/lbm_mesapd_coupling/momentum_exchange_method/PdfReconstruction.cpp @@ -557,9 +557,9 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, velocity, density, - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); @@ -573,7 +573,7 @@ int main( int argc, char **argv ) auto accessor = make_shared<ParticleAccessor_T>(ps, ss); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphere.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphere.cpp index 602011cefdf119b4fa7d2505e30a6b10f6d23c96..769088479e0b0963df7b02b4976a1d03bd6f9228 100644 --- 
a/tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphere.cpp +++ b/tests/lbm_mesapd_coupling/momentum_exchange_method/SettlingSphere.cpp @@ -564,14 +564,14 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( real_t(1) / relaxationTime ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; diff --git a/tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMapping.cpp b/tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMapping.cpp index 4ff4bc3d81281bf32d618bafe5944a9e214453a3..de677b8fd4d02f1800a26ba12d077f72ed592286 100644 --- a/tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMapping.cpp +++ b/tests/lbm_mesapd_coupling/momentum_exchange_method/UpdateParticleMapping.cpp @@ -288,9 +288,9 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< 
LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3<real_t>(real_t(0)), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); @@ -304,7 +304,7 @@ int main( int argc, char **argv ) auto planeShape = ss->create<mesa_pd::data::HalfSpace>( Vector3<real_t>(real_t(1), real_t(0), real_t(0)) ); // add particle field - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); // add boundary handling using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp index fc7f56dd32fa0d6f126fd723c8b6ba24be3f354d..151b109c07e083b7d5adc55cb28a1dd0fce0e4eb 100644 --- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp +++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/DragForceSphere.cpp @@ -398,7 +398,7 @@ int main(int argc, char** argv) // add PDF field BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( - blocks, "pdf field (zyxf)", latticeModel, Vector3< real_t >(real_t(0)), real_t(1), uint_t(1), field::zyxf); + blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >(real_t(0)), real_t(1), uint_t(1), field::fzyx); /////////////// // TIME LOOP // @@ -416,7 +416,7 @@ int main(int argc, char** argv) BlockDataID particleAndVolumeFractionFieldID = field::addToStorage< lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T >( blocks, "particle and volume fraction field", - 
std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::zyxf, 0); + std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::fzyx, 0); lbm_mesapd_coupling::psm::ParticleAndVolumeFractionMapping particleMapping( blocks, accessor, lbm_mesapd_coupling::GlobalParticlesSelector(), particleAndVolumeFractionFieldID, 4); particleMapping(); diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp index eb03d3537911a899d2022a305c9a2780aa5595d1..d1bd029a7fa4109d2390b14f93744d2b8d6b4d75 100644 --- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp +++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/ParticleAndVolumeFractionMapping.cpp @@ -179,7 +179,7 @@ int main(int argc, char** argv) BlockDataID particleAndVolumeFractionFieldID = field::addToStorage< lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T >( blocks, "particle and volume fraction field", - std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::zyxf, 0); + std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::fzyx, 0); // calculate fraction lbm_mesapd_coupling::psm::ParticleAndVolumeFractionMapping particleMapping( diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp index 0964325f3cc154ccdf754348e9eac24c32789aa4..8144d27523f2d72c57bc08e735beb2f8f8d6adf7 100644 --- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp +++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/SettlingSphere.cpp @@ -581,7 +581,7 @@ int main(int argc, char** argv) // add PDF field BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( - blocks, "pdf field (zyxf)", 
latticeModel, Vector3< real_t >(real_t(0)), real_t(1), uint_t(1), field::zyxf); + blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >(real_t(0)), real_t(1), uint_t(1), field::fzyx); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); @@ -630,7 +630,7 @@ int main(int argc, char** argv) BlockDataID particleAndVolumeFractionFieldID = field::addToStorage< lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T >( blocks, "particle and volume fraction field", - std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::zyxf, 0); + std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::fzyx, 0); lbm_mesapd_coupling::psm::ParticleAndVolumeFractionMapping particleMapping(blocks, accessor, sphereSelector, particleAndVolumeFractionFieldID, 4); particleMapping(); diff --git a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp index 9b965c164c02b3b6b6f82a2c8de8593b04ac473d..f0db14fe5d902f80b1e5f293ef6dc0b5a50f64ca 100644 --- a/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp +++ b/tests/lbm_mesapd_coupling/partially_saturated_cells_method/TorqueSphere.cpp @@ -410,14 +410,14 @@ int main(int argc, char** argv) // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( - blocks, "pdf field (zyxf)", latticeModel, Vector3< real_t >(real_c(0), real_c(0), real_c(0)), real_c(1), - uint_t(1), field::zyxf); + blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >(real_c(0), real_c(0), real_c(0)), real_c(1), + uint_t(1), field::fzyx); // add particle and volume fraction field BlockDataID particleAndVolumeFractionFieldID = field::addToStorage< lbm_mesapd_coupling::psm::ParticleAndVolumeFractionField_T >( blocks, "particle and volume fraction field", - std::vector< 
lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::zyxf, 0); + std::vector< lbm_mesapd_coupling::psm::ParticleAndVolumeFraction_T >(), field::fzyx, 0); // map bodies and calculate solid volume fraction initially lbm_mesapd_coupling::psm::ParticleAndVolumeFractionMapping particleMapping( blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(), particleAndVolumeFractionFieldID, 4); diff --git a/tests/lbm_mesapd_coupling/utility/InspectionProbe.cpp b/tests/lbm_mesapd_coupling/utility/InspectionProbe.cpp index c8c4d173e82be628dc3f79aadea30539f3fdf9bf..d4602c46e87dd391bbb49868120342518e6f9446 100644 --- a/tests/lbm_mesapd_coupling/utility/InspectionProbe.cpp +++ b/tests/lbm_mesapd_coupling/utility/InspectionProbe.cpp @@ -185,13 +185,13 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( real_t(1) ) ); - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); - BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers ); + BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::fzyx, FieldGhostLayers ); using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type; BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >(MyBoundaryHandling<ParticleAccessor_T>( flagFieldID, pdfFieldID, particleFieldID, accessor), "boundary handling" ); diff --git 
a/tests/mesa_pd/kernel/CoefficientOfRestitutionSD.cpp b/tests/mesa_pd/kernel/CoefficientOfRestitutionSD.cpp index c1adc3102ef9d775e8b1bf952c87a9acdbcd73af..7592db52d6d206760b9aeb89182c227cddb6ccbd 100644 --- a/tests/mesa_pd/kernel/CoefficientOfRestitutionSD.cpp +++ b/tests/mesa_pd/kernel/CoefficientOfRestitutionSD.cpp @@ -139,6 +139,7 @@ int main( int argc, char** argv ) dem(acd.getIdx1(), acd.getIdx2(), *accessor, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth()); } auto force = accessor->getForce(0); + WALBERLA_UNUSED(force); if(useVelocityVerlet) vvPostForce(0,*accessor); else implEuler(0, *accessor); @@ -171,4 +172,4 @@ int main( int argc, char** argv ) int main( int argc, char** argv ) { return dem_integrator_accuracy::main(argc, argv); -} +} \ No newline at end of file diff --git a/tests/pde/BoundaryTest.cpp b/tests/pde/BoundaryTest.cpp index df548787a45c42fe7ade1d91a878601c7772361b..acc3c3963399144829a5c1a67966a0f12c3a270a 100644 --- a/tests/pde/BoundaryTest.cpp +++ b/tests/pde/BoundaryTest.cpp @@ -295,12 +295,12 @@ int main( int argc, char** argv ) true, false, false, false ); - BlockDataID solId = field::addToStorage< Field_T >( blocks, "sol", real_t(0), field::zyxf, uint_t(1) ); - BlockDataID rId = field::addToStorage< Field_T >( blocks, "r", real_t(0), field::zyxf, uint_t(1) ); - BlockDataID dId = field::addToStorage< Field_T >( blocks, "d", real_t(0), field::zyxf, uint_t(1) ); - BlockDataID zId = field::addToStorage< Field_T >( blocks, "z", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID solId = field::addToStorage< Field_T >( blocks, "sol", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID rId = field::addToStorage< Field_T >( blocks, "r", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID dId = field::addToStorage< Field_T >( blocks, "d", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID zId = field::addToStorage< Field_T >( blocks, "z", real_t(0), field::fzyx, uint_t(1) ); - BlockDataID rhsId = field::addToStorage< 
Field_T >( blocks, "rhs", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID rhsId = field::addToStorage< Field_T >( blocks, "rhs", real_t(0), field::fzyx, uint_t(1) ); initRHS( blocks, rhsId ); diff --git a/tests/pde/CGTest.cpp b/tests/pde/CGTest.cpp index a14832467fa0615e4720c330372cec29aea9b7c2..810811e693ad4676a04935f4b174a2104aa0662b 100644 --- a/tests/pde/CGTest.cpp +++ b/tests/pde/CGTest.cpp @@ -148,14 +148,14 @@ int main( int argc, char** argv ) true, false, false, false ); - BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::zyxf, uint_t(1) ); - BlockDataID rId = field::addToStorage< PdeField_T >( blocks, "r", real_t(0), field::zyxf, uint_t(1) ); - BlockDataID dId = field::addToStorage< PdeField_T >( blocks, "d", real_t(0), field::zyxf, uint_t(1) ); - BlockDataID zId = field::addToStorage< PdeField_T >( blocks, "z", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID rId = field::addToStorage< PdeField_T >( blocks, "r", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID dId = field::addToStorage< PdeField_T >( blocks, "d", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID zId = field::addToStorage< PdeField_T >( blocks, "z", real_t(0), field::fzyx, uint_t(1) ); initU( blocks, uId ); - BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::fzyx, uint_t(1) ); initF( blocks, fId ); diff --git a/tests/pde/JacobiTest.cpp b/tests/pde/JacobiTest.cpp index 2e65dc6ef3c69c9c19469fff47e15f2b91ffad2d..fda36b2f1dd238108318ecd12d5c91a61782a04f 100644 --- a/tests/pde/JacobiTest.cpp +++ b/tests/pde/JacobiTest.cpp @@ -153,12 +153,12 @@ int main( int argc, char** argv ) true, false, false, false ); - BlockDataID srcId = field::addToStorage< PdeField_T >( blocks, "u (src)", real_t(0), field::zyxf, uint_t(1) 
); - BlockDataID dstId = field::addToStorage< PdeField_T >( blocks, "u (dst)", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID srcId = field::addToStorage< PdeField_T >( blocks, "u (src)", real_t(0), field::fzyx, uint_t(1) ); + BlockDataID dstId = field::addToStorage< PdeField_T >( blocks, "u (dst)", real_t(0), field::fzyx, uint_t(1) ); initU( blocks, srcId, dstId ); - BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::fzyx, uint_t(1) ); initF( blocks, fId ); diff --git a/tests/pde/MGConvergenceTest.cpp b/tests/pde/MGConvergenceTest.cpp index c5c66aae1a365252d2e5bd48daab89936540f0f8..a7f3902d1e7ed523d017a2eebe6163d2d7a69332 100644 --- a/tests/pde/MGConvergenceTest.cpp +++ b/tests/pde/MGConvergenceTest.cpp @@ -267,12 +267,12 @@ real_t runConvergenceConstStencil(const real_t xDomainSize, const real_t yDomain WALBERLA_LOG_RESULT_ON_ROOT("Discretization dx: " << dx << ", " << dy << ", " << dz); - BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::fzyx, uint_t(1) ); initU(blocks, uId); // initURect( blocks, uId ); - BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::fzyx, uint_t(1) ); SweepTimeloop timeloop( blocks, uint_t(1) ); diff --git a/tests/pde/MGTest.cpp b/tests/pde/MGTest.cpp index 9993de251d27260773698b8f9645bd15348d4b8c..3ab49d9c22976b0c320a9b7a92e5caec86da9fbf 100644 --- a/tests/pde/MGTest.cpp +++ b/tests/pde/MGTest.cpp @@ -187,11 +187,11 @@ int main( int argc, char** argv ) // run the main test - BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID uId = field::addToStorage< PdeField_T 
>( blocks, "u", real_t(0), field::fzyx, uint_t(1) ); initU( blocks, uId ); - BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::fzyx, uint_t(1) ); SweepTimeloop timeloop( blocks, uint_t(1) ); diff --git a/tests/pde/RBGSTest.cpp b/tests/pde/RBGSTest.cpp index 6417395dbd1d10a12bb9e243212e1b88c4a8911f..a3d89f7c57f534ad5d68da4e5ab665edcbac5220 100644 --- a/tests/pde/RBGSTest.cpp +++ b/tests/pde/RBGSTest.cpp @@ -151,11 +151,11 @@ int main( int argc, char** argv ) true, false, false, false ); - BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::fzyx, uint_t(1) ); initU( blocks, uId ); - BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::fzyx, uint_t(1) ); initF( blocks, fId ); diff --git a/tests/pde/SORTest.cpp b/tests/pde/SORTest.cpp index 374aa0b7c332e07da760a686681d446783082719..df1eedf6ceceec6f0ac7f2fb0a063f400253dfaf 100644 --- a/tests/pde/SORTest.cpp +++ b/tests/pde/SORTest.cpp @@ -151,11 +151,11 @@ int main( int argc, char** argv ) true, false, false, false ); - BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID uId = field::addToStorage< PdeField_T >( blocks, "u", real_t(0), field::fzyx, uint_t(1) ); initU( blocks, uId ); - BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::zyxf, uint_t(1) ); + BlockDataID fId = field::addToStorage< PdeField_T >( blocks, "f", real_t(0), field::fzyx, uint_t(1) ); initF( blocks, fId ); diff --git a/tests/pe/Refinement.cpp b/tests/pe/Refinement.cpp index 
943526d15552da56f033ea22d735509c3c189291..96900a6251fc8972f2ec382cbf98c4311c00eea9 100644 --- a/tests/pe/Refinement.cpp +++ b/tests/pe/Refinement.cpp @@ -167,7 +167,7 @@ int main( int argc, char ** argv ) createSphere(*globalStorage.get(), forest->getBlockStorage(), storageID, 0, Vec3(5,5,5), 1); createSphere(*globalStorage.get(), forest->getBlockStorage(), storageID, 0, Vec3(15,6,6), 1); - timeloop::SweepTimeloop timeloop( forest->getBlockStorage(), 1 ); + SweepTimeloop timeloop( forest->getBlockStorage(), 1 ); timeloop.addFuncBeforeTimeStep( simpleLB, "refreshFunctorName" ); for (int i = 0; i < 1; ++i) diff --git a/tests/pe_coupling/discrete_particle_methods/HinderedSettlingDynamicsDPM.cpp b/tests/pe_coupling/discrete_particle_methods/HinderedSettlingDynamicsDPM.cpp index 539cf5ed93b7b08a5a2743117c9d5ea875c81fbb..69a9220c92b35f780191fb6c8bcd868539da2ba9 100644 --- a/tests/pe_coupling/discrete_particle_methods/HinderedSettlingDynamicsDPM.cpp +++ b/tests/pe_coupling/discrete_particle_methods/HinderedSettlingDynamicsDPM.cpp @@ -935,20 +935,20 @@ int main( int argc, char **argv ) ////////////////////// // create force field - BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >( blocks, "force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); + BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >( blocks, "force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); - BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "drag force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID amForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "am force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "lift force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); + BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "drag force 
field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID amForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "am force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "lift force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); // create omega field - BlockDataID omegaFieldID = field::addToStorage< ScalarField_T >( blocks, "omega field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID omegaFieldID = field::addToStorage< ScalarField_T >( blocks, "omega field", real_t(0), field::fzyx, FieldGhostLayers ); // create the lattice model LatticeModel_T latticeModel = LatticeModel_T( omegaFieldID, ForceModel_T( forceFieldID ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel, initialFluidVelocity, real_t(1), FieldGhostLayers, field::zyxf ); + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, initialFluidVelocity, real_t(1), FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -957,30 +957,30 @@ int main( int argc, char **argv ) BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, pdfFieldID ), "boundary handling" ); // field to store fluid velocity - BlockDataID velocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers ); - BlockDataID oldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "old velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers ); - BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "swapped old velocity field", initialFluidVelocity, field::zyxf, FieldGhostLayers ); + BlockDataID velocityFieldID = 
field::addToStorage< Vec3Field_T >( blocks, "velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers ); + BlockDataID oldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "old velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers ); + BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "swapped old velocity field", initialFluidVelocity, field::fzyx, FieldGhostLayers ); // create pressure field - BlockDataID pressureFieldID = field::addToStorage< ScalarField_T >( blocks, "pressure field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID pressureFieldID = field::addToStorage< ScalarField_T >( blocks, "pressure field", real_t(0), field::fzyx, FieldGhostLayers ); // create solid volume fraction field - BlockDataID svfFieldID = field::addToStorage< ScalarField_T >( blocks, "svf field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID svfFieldID = field::addToStorage< ScalarField_T >( blocks, "svf field", real_t(0), field::fzyx, FieldGhostLayers ); // field to store pressure gradient - BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "pressure gradient field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "pressure gradient field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store curl of fluid velocity - BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity curl field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity curl field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store velocity gradient - BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( blocks, "velocity gradient field", Matrix3<real_t>(real_c(0)), field::zyxf, 
FieldGhostLayers ); + BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( blocks, "velocity gradient field", Matrix3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store gradient of stress tensor - BlockDataID stressTensorGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "stress tensor gradient field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID stressTensorGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "stress tensor gradient field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store time derivative of fluid velocity - BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "time derivative velocity field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "time derivative velocity field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // communication schemes pe_coupling::discrete_particle_methods::CombinedReductionFieldCommunication<Vec3Field_T> forceComm( blocks, forceFieldID ); diff --git a/tests/pe_coupling/discrete_particle_methods/SphereWallCollisionBehaviorDPM.cpp b/tests/pe_coupling/discrete_particle_methods/SphereWallCollisionBehaviorDPM.cpp index c86d7f1b8fb6b230a622b3c93b452b3d4ede7eda..97515a3e35ae8b1dfb5be99847529f1d0b575d39 100644 --- a/tests/pe_coupling/discrete_particle_methods/SphereWallCollisionBehaviorDPM.cpp +++ b/tests/pe_coupling/discrete_particle_methods/SphereWallCollisionBehaviorDPM.cpp @@ -738,20 +738,20 @@ int main( int argc, char **argv ) ////////////////////// // create force field - BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >( blocks, "force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); + BlockDataID forceFieldID = field::addToStorage< Vec3Field_T >( blocks, "force field", Vector3<real_t>(real_t(0)), field::fzyx, 
FieldGhostLayers ); - BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "drag force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID amForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "am force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); - BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "lift force field", Vector3<real_t>(real_t(0)), field::zyxf, FieldGhostLayers ); + BlockDataID dragForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "drag force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID amForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "am force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); + BlockDataID liftForceFieldID = field::addToStorage< Vec3Field_T >( blocks, "lift force field", Vector3<real_t>(real_t(0)), field::fzyx, FieldGhostLayers ); // create omega field - BlockDataID omegaFieldID = field::addToStorage< ScalarField_T >( blocks, "omega field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID omegaFieldID = field::addToStorage< ScalarField_T >( blocks, "omega field", real_t(0), field::fzyx, FieldGhostLayers ); // create the lattice model LatticeModel_T latticeModel = LatticeModel_T( omegaFieldID, ForceModel_T( forceFieldID ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel, Vector3<real_t>(0), real_t(1), FieldGhostLayers, field::zyxf ); + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, Vector3<real_t>(0), real_t(1), FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -760,29 +760,29 @@ int main( int argc, char **argv ) BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, 
pdfFieldID ), "boundary handling" ); // field to store fluid velocity - BlockDataID velocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity field", Vector3<real_t>(0), field::zyxf, FieldGhostLayers ); - BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "swapped old velocity field", Vector3<real_t>(0), field::zyxf, FieldGhostLayers ); + BlockDataID velocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity field", Vector3<real_t>(0), field::fzyx, FieldGhostLayers ); + BlockDataID swappedOldVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "swapped old velocity field", Vector3<real_t>(0), field::fzyx, FieldGhostLayers ); // create pressure field - BlockDataID pressureFieldID = field::addToStorage< ScalarField_T >( blocks, "pressure field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID pressureFieldID = field::addToStorage< ScalarField_T >( blocks, "pressure field", real_t(0), field::fzyx, FieldGhostLayers ); // create solid volume fraction field - BlockDataID svfFieldID = field::addToStorage< ScalarField_T >( blocks, "svf field", real_t(0), field::zyxf, FieldGhostLayers ); + BlockDataID svfFieldID = field::addToStorage< ScalarField_T >( blocks, "svf field", real_t(0), field::fzyx, FieldGhostLayers ); // field to store pressure gradient - BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "pressure gradient field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID pressureGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "pressure gradient field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store curl of fluid velocity - BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity curl field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID velocityCurlFieldID = field::addToStorage< Vec3Field_T >( blocks, "velocity curl field", 
Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store velocity gradient - BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( blocks, "velocity gradient field", Matrix3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID velocityGradientFieldID = field::addToStorage< TensorField_T >( blocks, "velocity gradient field", Matrix3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store gradient of stress tensor - BlockDataID stressTensorGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "stress tensor gradient field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID stressTensorGradientFieldID = field::addToStorage< Vec3Field_T >( blocks, "stress tensor gradient field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // field to store time derivative of fluid velocity - BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "time derivative velocity field", Vector3<real_t>(real_c(0)), field::zyxf, FieldGhostLayers ); + BlockDataID timeDerivativeVelocityFieldID = field::addToStorage< Vec3Field_T >( blocks, "time derivative velocity field", Vector3<real_t>(real_c(0)), field::fzyx, FieldGhostLayers ); // communication schemes pe_coupling::discrete_particle_methods::CombinedReductionFieldCommunication<Vec3Field_T> forceComm( blocks, forceFieldID ); diff --git a/tests/pe_coupling/momentum_exchange_method/BodyAtBlockBoarderCheck.cpp b/tests/pe_coupling/momentum_exchange_method/BodyAtBlockBoarderCheck.cpp index 5b07af01cb1fec32b36e002523183bb16001927f..397983b419997aaa42b0388bbd8b9ae0ddd641ef 100644 --- a/tests/pe_coupling/momentum_exchange_method/BodyAtBlockBoarderCheck.cpp +++ b/tests/pe_coupling/momentum_exchange_method/BodyAtBlockBoarderCheck.cpp @@ -308,15 +308,15 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) ); // 
add PDF field ( uInit = <0.1,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, referenceVelocity, real_c(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/momentum_exchange_method/BodyMappingTest.cpp b/tests/pe_coupling/momentum_exchange_method/BodyMappingTest.cpp index b86a57b5f2986c47ee9a15546a89f94be5733db2..175f657d78ba5ee826773a5bffba6576939b19f0 100644 --- a/tests/pe_coupling/momentum_exchange_method/BodyMappingTest.cpp +++ b/tests/pe_coupling/momentum_exchange_method/BodyMappingTest.cpp @@ -448,15 +448,15 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega ); // add PDF field ( uInit = <0.1,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3<real_t>(real_t(0)), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = 
field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >(MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID ), "boundary handling" ); diff --git a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp index 0320375a4f5677cb2a414647daaabaaf6c4bdb6c..a3ab539d7c8bd151dd0c3cda592f23aa551d4dd0 100644 --- a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp @@ -491,19 +491,19 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ), ForceModel_T( Vector3<real_t> ( setup.extForce, 0, 0 ) ) ); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add PDF field, for MR boundary condition only, has to be same layout as pdfField BlockDataID pdfFieldPreColID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, " pre collision pdf field", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + 
BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp index 9184f1f112ffa711501654bc32c4d382746cc96f..226127466b40eb1bf24b2e1350503e559a809521 100644 --- a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp +++ b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp @@ -536,15 +536,15 @@ int main( int argc, char **argv ) lbm::force_model::SimpleConstant( Vector3<real_t> ( setup.extForce, 0, 0 ) ) ); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/momentum_exchange_method/GlobalBodyAsBoundaryMEMStaticRefinement.cpp b/tests/pe_coupling/momentum_exchange_method/GlobalBodyAsBoundaryMEMStaticRefinement.cpp index 
5cf642886a317cafdbe321e134f7afaf2a8d763c..67a3ce68cfabf4ba7cae569dfa06bbe23aba14db 100644 --- a/tests/pe_coupling/momentum_exchange_method/GlobalBodyAsBoundaryMEMStaticRefinement.cpp +++ b/tests/pe_coupling/momentum_exchange_method/GlobalBodyAsBoundaryMEMStaticRefinement.cpp @@ -372,15 +372,15 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( real_t(1) / relaxationTime, lbm::collision_model::TRT::threeSixteenth, finestLevel ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID ), "boundary handling" ); diff --git a/tests/pe_coupling/momentum_exchange_method/LubricationCorrectionMEM.cpp b/tests/pe_coupling/momentum_exchange_method/LubricationCorrectionMEM.cpp index 657eae0b75e24e80872f90862ce41949e00728e6..4b0569bce75ca5f94be3128773bd52f11055971a 100644 --- a/tests/pe_coupling/momentum_exchange_method/LubricationCorrectionMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/LubricationCorrectionMEM.cpp @@ -891,15 +891,15 @@ int main( int argc, char **argv ) BlockDataID pdfFieldID = useFZYX ? 
lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0), real_t(0), real_t(0) ), real_t(1), uint_t(1), field::fzyx ) : - lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel, + lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0), real_t(0), real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries (moving walls on the front, back, top, and bottom plane) BlockDataID boundaryHandlingID; diff --git a/tests/pe_coupling/momentum_exchange_method/PeriodicParticleChannelMEM.cpp b/tests/pe_coupling/momentum_exchange_method/PeriodicParticleChannelMEM.cpp index 41ebf9d9d7fe6cf1b4c62e5a50a102cd3bb32bdb..3243b83eb12e879d5f93e651571acd489ce346db 100644 --- a/tests/pe_coupling/momentum_exchange_method/PeriodicParticleChannelMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/PeriodicParticleChannelMEM.cpp @@ -475,15 +475,15 @@ int main( int argc, char **argv ) BlockDataID pdfFieldID = useFZYX ? 
lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( velocity, real_t(0), real_t(0) ), real_t(1), uint_t(1), field::fzyx ) : - lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel, + lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( velocity, real_t(0), real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries (moving walls on the front, back, top, and bottom plane) BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp b/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp index 91d0dbc0132ee6f227e9a6b21d85056a76095225..9e207e2b07ed46eee4e0fb85280ffa318a4db49d 100644 --- a/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp @@ -582,14 +582,14 @@ int main( int argc, char **argv ) lbm::force_model::SimpleConstant( Vector3<real_t> ( setup.forcing, real_t(0), real_t(0) ) ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add PDF field, for MR boundary condition only BlockDataID pdfFieldPreColID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, " pre 
collision pdf field", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // initialize already with the Poiseuille flow profile initPDF( blocks, pdfFieldID, setup); @@ -598,7 +598,7 @@ int main( int argc, char **argv ) BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEM.cpp b/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEM.cpp index 706422bc5c2f1ad9650e38f03e7fbd7b867efa17..75e53c1b55145a9ffad28c335c0cd308dcbdf9aa 100644 --- a/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEM.cpp @@ -504,14 +504,14 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( real_t(1) / relaxationTime ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", 
nullptr, field::fzyx ); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID ), "boundary handling" ); diff --git a/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMDynamicRefinement.cpp b/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMDynamicRefinement.cpp index 47ccab5830f68c84dd98ee3f63685b61f7a5ea1f..ffe4a043685100d5956b3be6fdcf647790672fc5 100644 --- a/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMDynamicRefinement.cpp +++ b/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMDynamicRefinement.cpp @@ -631,14 +631,14 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( real_t(1) / relaxationTime, lbm::collision_model::TRT::threeSixteenth, finestLevel ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addBlockData( make_shared< MyBoundaryHandling >( blocks, flagFieldID, pdfFieldID, bodyFieldID ), diff --git a/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMStaticRefinement.cpp 
b/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMStaticRefinement.cpp index d3952bab9e975d3584c40acfeeba394af28bb07f..926e9003d8c89dfc6bf8867ff14939d1d8e26ade 100644 --- a/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMStaticRefinement.cpp +++ b/tests/pe_coupling/momentum_exchange_method/SettlingSphereMEMStaticRefinement.cpp @@ -573,14 +573,14 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( real_t(1) / relaxationTime, lbm::collision_model::TRT::threeSixteenth, finestLevel ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf, FieldGhostLayers ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers ); // add boundary handling BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID ), "boundary handling" ); diff --git a/tests/pe_coupling/momentum_exchange_method/SquirmerTest.cpp b/tests/pe_coupling/momentum_exchange_method/SquirmerTest.cpp index 1fca0831a237a19c119e5c7757dfea1b592a3979..7c02a1f60ade6dae264f8109312d47826be3dcd4 100644 --- a/tests/pe_coupling/momentum_exchange_method/SquirmerTest.cpp +++ b/tests/pe_coupling/momentum_exchange_method/SquirmerTest.cpp @@ -314,14 +314,14 @@ int main(int argc, char **argv) { LatticeModel_T latticeModel = 
LatticeModel_T(lbm::collision_model::TRT::constructWithMagicNumber(omega)); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage<LatticeModel_T>(blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage<LatticeModel_T>(blocks, "pdf field (fzyx)", latticeModel, FieldGhostLayers); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "flag field"); // add body field - BlockDataID bodyFieldID = field::addToStorage<PeBodyField_T>(blocks, "body field", nullptr, field::zyxf); + BlockDataID bodyFieldID = field::addToStorage<PeBodyField_T>(blocks, "body field", nullptr, field::fzyx); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData<BoundaryHandling_T>( diff --git a/tests/pe_coupling/momentum_exchange_method/TaylorCouetteFlowMEM.cpp b/tests/pe_coupling/momentum_exchange_method/TaylorCouetteFlowMEM.cpp index 2fad6814a2a32c8644fbfe8998a55e9dc0b1669a..ec22f4445e8a3f43fd192bf66e9ae5a55ae64f6a 100644 --- a/tests/pe_coupling/momentum_exchange_method/TaylorCouetteFlowMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/TaylorCouetteFlowMEM.cpp @@ -324,13 +324,13 @@ int main( int argc, char **argv ) // add pdf field LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) ); - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (zyxf)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), uint_t(1), field::zyxf ); + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_t(0) ), real_t(1), uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf 
); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries (moving walls on the front, back, top, and bottom plane) BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp b/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp index 0dc2dba168c8f0f48ff67acdf2ebf1c0df23472a..5807b2ca7277b111f7abf1a7c3a9c5d8cf79a1c1 100644 --- a/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp +++ b/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp @@ -441,20 +441,20 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) ); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add PDF field, for MR boundary condition only BlockDataID pdfFieldPreColID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, " pre collision pdf field", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); // add body field - BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::zyxf ); + BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = 
blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp b/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp index 24902900f47a2504fad7fd515c6fc983f902c3d9..b1b8d3bd6d79a6ebc2951ee34471862688906544 100644 --- a/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp +++ b/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSM.cpp @@ -427,13 +427,13 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega, ForceModel_T( Vector3<real_t> ( setup.extForce, 0, 0 ) ) ); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add body and volume fraction field BlockDataID bodyAndVolumeFractionFieldID = field::addToStorage< BodyAndVolumeFractionField_T >( blocks, "body and volume fraction field", - std::vector<BodyAndVolumeFraction_T>(), field::zyxf, 0 ); + std::vector<BodyAndVolumeFraction_T>(), field::fzyx, 0 ); // map bodies and calculate solid volume fraction initially pe_coupling::BodyAndVolumeFractionMapping bodyMapping( blocks, globalBodyStorage, bodyStorageID, bodyAndVolumeFractionFieldID ); diff --git a/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSMRefinement.cpp b/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSMRefinement.cpp index d5444ab41c7122d9851fa6a0a75fcb608eccc35e..4793ca2759232e1fb77c97f0be1d950307e618b3 100644 --- a/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSMRefinement.cpp +++ b/tests/pe_coupling/partially_saturated_cells_method/DragForceSpherePSMRefinement.cpp @@ -540,16 
+540,16 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega, lbm::force_model::SimpleConstant( Vector3<real_t> ( setup.extForce, 0, 0 ) ) ); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - FieldGhostLayers, field::zyxf ); + FieldGhostLayers, field::fzyx ); // add flag field BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers ); // add body and volume fraction field BlockDataID bodyAndVolumeFractionFieldID = field::addToStorage< BodyAndVolumeFractionField_T >( blocks, "body and volume fraction field", - std::vector<BodyAndVolumeFraction_T>(), field::zyxf, FieldGhostLayers ); + std::vector<BodyAndVolumeFraction_T>(), field::fzyx, FieldGhostLayers ); // add boundary handling & initialize outer domain boundaries BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >( diff --git a/tests/pe_coupling/partially_saturated_cells_method/SegreSilberbergPSM.cpp b/tests/pe_coupling/partially_saturated_cells_method/SegreSilberbergPSM.cpp index 5b256da44a5294bb473cb0b3f092d7e8725359af..48a499195cd779c1cdbab89437bdef3aa5e0bef6 100644 --- a/tests/pe_coupling/partially_saturated_cells_method/SegreSilberbergPSM.cpp +++ b/tests/pe_coupling/partially_saturated_cells_method/SegreSilberbergPSM.cpp @@ -531,9 +531,9 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega, lbm::force_model::SimpleConstant( Vector3<real_t> ( setup.forcing, real_c(0), real_c(0) ) ) ); // add PDF field - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = 
lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // initialize already with the Poiseuille flow profile initPDF( blocks, pdfFieldID, setup); @@ -547,7 +547,7 @@ int main( int argc, char **argv ) // add body and volume fraction field BlockDataID bodyAndVolumeFractionFieldID = field::addToStorage< BodyAndVolumeFractionField_T >( blocks, "body and volume fraction field", - std::vector<BodyAndVolumeFraction_T>(), field::zyxf, 0 ); + std::vector<BodyAndVolumeFraction_T>(), field::fzyx, 0 ); // map bodies and calculate solid volume fraction initially pe_coupling::BodyAndVolumeFractionMapping bodyMapping( blocks, globalBodyStorage, bodyStorageID, bodyAndVolumeFractionFieldID, pe_coupling::selectRegularBodies ); bodyMapping(); diff --git a/tests/pe_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp b/tests/pe_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp index cd72e01bd32c73c892bfb21afe8b6db720f096ac..1065fce508230dd021210320c4c7c4471ea8da61 100644 --- a/tests/pe_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp +++ b/tests/pe_coupling/partially_saturated_cells_method/TorqueSpherePSM.cpp @@ -376,13 +376,13 @@ int main( int argc, char **argv ) LatticeModel_T latticeModel = LatticeModel_T( omega ); // add PDF field ( uInit = <0,0,0>, rhoInit = 1 ) - BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel, + BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel, Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1), - uint_t(1), field::zyxf ); + uint_t(1), field::fzyx ); // add body and volume fraction field BlockDataID bodyAndVolumeFractionFieldID = field::addToStorage< BodyAndVolumeFractionField_T >( blocks, "body and volume fraction field", - 
std::vector<BodyAndVolumeFraction_T>(), field::zyxf, 0 ); + std::vector<BodyAndVolumeFraction_T>(), field::fzyx, 0 ); // map bodies and calculate solid volume fraction initially pe_coupling::BodyAndVolumeFractionMapping bodyMapping( blocks, globalBodyStorage, bodyStorageID, bodyAndVolumeFractionFieldID ); bodyMapping(); diff --git a/tests/timeloop/MultipleSweepFailTest.cpp b/tests/timeloop/MultipleSweepFailTest.cpp index 61a8eb4c40e689788d3107b50b4e4453b1054374..ee9147a2294be689b0d07c8e07fae43f4400cc62 100644 --- a/tests/timeloop/MultipleSweepFailTest.cpp +++ b/tests/timeloop/MultipleSweepFailTest.cpp @@ -41,7 +41,7 @@ namespace MultipleSweepFailTest int main(int argc, char** argv) { debug::enterTestMode(); - mpi::Environment env(argc, argv); + mpi::Environment const env(argc, argv); const std::shared_ptr< StructuredBlockForest > blockForest = blockforest::createUniformBlockGrid( uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), real_c(1), false, false, false, false); diff --git a/tests/timeloop/MultipleSweepTest.cpp b/tests/timeloop/MultipleSweepTest.cpp index 3ad8248ffd047781787fada4720eddfa42033f81..4f6138efdc2b6397426d81978de564be51fb09a7 100644 --- a/tests/timeloop/MultipleSweepTest.cpp +++ b/tests/timeloop/MultipleSweepTest.cpp @@ -39,7 +39,7 @@ namespace MultipleSweepTest int main(int argc, char** argv) { debug::enterTestMode(); - mpi::Environment env(argc, argv); + mpi::Environment const env(argc, argv); const std::shared_ptr< StructuredBlockForest > blockForest = blockforest::createUniformBlockGrid( uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), uint_c(1), real_c(1), false, false, false, false);