diff --git a/CMakeLists.txt b/CMakeLists.txt
index 087d203d7b4c54cbfd309ae078b4f3688008d9fe..e7226a9ab757ff1597e2f7ddc18ee674f5857229 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,13 @@ enable_testing()
 include_directories( src )
 include_directories ( ${your_project_name_BINARY_DIR}/src )
 
+# ccache is an optional build accelerator: use it as compiler launcher only when it is installed.
+find_program(CCACHE ccache)
+if(CCACHE)
+  message(STATUS "Found ccache ${CCACHE}")
+  set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE} CACHE STRING "ccache executable")
+endif()
+
 # Extends cmake module path - so that FindwaLBerla.cmake in the current directory is found
 set ( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${your_project_name_SOURCE_DIR} )
 find_package( waLBerla )
diff --git a/apps/example_app_codegen/CMakeLists.txt b/apps/example_app_codegen/CMakeLists.txt
index 3ba77d89073b58032a25d1f961a051971a4ac9af..435bae3048ac67346599af5ec353b006de851234 100644
--- a/apps/example_app_codegen/CMakeLists.txt
+++ b/apps/example_app_codegen/CMakeLists.txt
@@ -1,9 +1,10 @@
 waLBerla_link_files_to_builddir( *.prm  *.py)
 
 waLBerla_generate_target_from_python(NAME LatticeModelGenerated FILE LatticeModel.py
-        OUT_FILES LatticeModel.cpp LatticeModel.h
+        OUT_FILES CumulantMRTSweep.h CumulantMRTSweep.cpp CumulantMRTPackInfo.h CumulantMRTPackInfo.cpp InitialPDFsSetter.h InitialPDFsSetter.cpp CumulantMRTNoSlip.h CumulantMRTNoSlip.cpp
+	CumulantMRTSweepAVX.h CumulantMRTSweepAVX.cpp CumulantMRTPackInfoAVX.h CumulantMRTPackInfoAVX.cpp InitialPDFsSetterAVX.h InitialPDFsSetterAVX.cpp CumulantMRTNoSlipAVX.h CumulantMRTNoSlipAVX.cpp
 )
-
+target_compile_options(LatticeModelGenerated PRIVATE -mavx2)
 
 waLBerla_add_executable ( NAME ExampleAppCodegen
                           FILES ExampleApp.cpp
diff --git a/apps/example_app_codegen/ExampleApp.cpp b/apps/example_app_codegen/ExampleApp.cpp
index 3cdda97703b2e9bf0e5ac3e7d0e09cab73168429..2b987837e578b2a39733b7c736d0bea3edf3bda9 100644
--- a/apps/example_app_codegen/ExampleApp.cpp
+++ b/apps/example_app_codegen/ExampleApp.cpp
@@ -1,61 +1,267 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file 01_BlocksAndFields.cpp
-//! \author Martin Bauer <martin.bauer@fau.de>
+//! \file 03_AdvancedLBMCodegen.cpp
+//! \author Frederik Hennig <frederik.hennig@fau.de>
 //
 //======================================================================================================================
 
-#include "blockforest/Initialization.h"
-#include "core/Environment.h"
-#include "field/Field.h"
-#include "gui/Gui.h"
-#include "timeloop/SweepTimeloop.h"
+#include "blockforest/all.h"
 
-namespace walberla {
+#include "core/all.h"
 
-Field<real_t, 1>* createFields(IBlock* const block, StructuredBlockStorage * const storage) {
-   return new Field<real_t,1>(storage->getNumberOfXCells(*block),
-                              storage->getNumberOfYCells(*block),
-                              storage->getNumberOfZCells(*block),
-                              real_c(0));
-}
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#   include "cuda/AddGPUFieldToStorage.h"
+#   include "cuda/DeviceSelectMPI.h"
+#   include "cuda/HostFieldAllocator.h"
+#   include "cuda/ParallelStreams.h"
+#   include "cuda/communication/GPUPackInfo.h"
+#   include "cuda/communication/UniformGPUScheme.h"
+#endif
+
+#include "domain_decomposition/all.h"
+
+#include "field/all.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/all.h"
+
+#include "stencil/D2Q9.h"
+
+#include "timeloop/all.h"
 
-int main( int argc, char ** argv )
+//    Codegen Includes
+#if defined(WALBERLA_BUILD_WITH_AVX)
+#include "CumulantMRTNoSlipAVX.h"
+#include "CumulantMRTPackInfoAVX.h"
+#include "CumulantMRTSweepAVX.h"
+#include "InitialPDFsSetterAVX.h"
+#else
+#include "CumulantMRTNoSlip.h"
+#include "CumulantMRTPackInfo.h"
+#include "CumulantMRTSweep.h"
+#include "InitialPDFsSetter.h"
+#endif
+namespace walberla
 {
-   walberla::Environment env( argc, argv );
-   
-   shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(
-                                                                                  uint_c(3), uint_c(2), uint_c(4),
-                                                                                  uint_c(10), uint_c(8), uint_c(12),
-                                                                                  real_c(0.5),
-                                                                                  false,
-                                                                                  false, false, false);
-   
-   blocks->addStructuredBlockData< Field<real_t,1> >( &createFields, "My Field" );
-
-   
-   SweepTimeloop timeloop( blocks, uint_c(1) );
-   GUI gui( timeloop, blocks, argc, argv );
-   gui.run();
+///////////////////////
+/// Typedef Aliases ///
+///////////////////////
 
-   return EXIT_SUCCESS;
-}
+// Pack info handed to the communication scheme
+#if defined(WALBERLA_BUILD_WITH_AVX)
+using PackInfo_T = pystencils::CumulantMRTPackInfoAVX;
+#else
+using PackInfo_T = pystencils::CumulantMRTPackInfo;
+#endif
+
+// Stencil of the lattice Boltzmann method
+using Stencil_T = stencil::D2Q9;
+
+// PDF field type (parameterized with Stencil_T::Size)
+using PdfField_T = field::GhostLayerField< real_t, Stencil_T::Size >;
+
+// Velocity field type (parameterized with Stencil_T::D)
+using VectorField_T = field::GhostLayerField< real_t, Stencil_T::D >;
+
+// Flag field and no-slip boundary types
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+#if defined(WALBERLA_BUILD_WITH_AVX)
+using NoSlip_T = lbm::CumulantMRTNoSlipAVX;
+#else
+using NoSlip_T = lbm::CumulantMRTNoSlip;
+#endif
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+using GPUField = cuda::GPUField< real_t >;
+#endif
+
+//////////////////////////////////////////
+/// Shear Flow Velocity Initialization ///
+//////////////////////////////////////////
+
+void initShearFlowVelocityField(const shared_ptr< StructuredBlockForest >& blocks, const BlockDataID& velocityFieldId,
+                                const Config::BlockHandle& config)
+{
+   // Generator for the transverse perturbation, seeded from the "noiseSeed" parameter.
+   math::RealRandom< real_t > rng(config.getParameter< std::mt19937::result_type >("noiseSeed", 42));
+
+   const real_t velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08));
+   const real_t noiseMagnitude    = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude);
+   const real_t domainHeight      = real_c(blocks->getNumberOfYCells());
+
+   for (auto& block : *blocks)
+   {
+      auto velocityField = block.getData< VectorField_T >(velocityFieldId);
+      for (auto cellIt = velocityField->beginWithGhostLayerXYZ(); cellIt != velocityField->end(); ++cellIt)
+      {
+         const Cell localCell = cellIt.cell();
+         Cell globalCell(localCell);
+         blocks->transformBlockLocalToGlobalCell(globalCell, block);
+
+         // x component: +magnitude where relative y < 0.3 or > 0.7, -magnitude in the band in between.
+         const real_t relativeY   = real_c(globalCell.y()) / domainHeight;
+         const bool inCentralBand = !(relativeY < 0.3 || relativeY > 0.7);
+         velocityField->get(localCell, 0) = inCentralBand ? -velocityMagnitude : velocityMagnitude;
+         velocityField->get(localCell, 1) = noiseMagnitude * rng(); // y component: scaled random noise
+      }
+   }
 }
 
-int main( int argc, char ** argv )
+/////////////////////
+/// Main Function ///
+/////////////////////
+
+int main(int argc, char** argv)
 {
-   return walberla::main(argc, argv);
+   walberla::Environment walberlaEnv(argc, argv);
+   for (int i = 0; i < argc; ++i) { WALBERLA_LOG_INFO_ON_ROOT("argument " << i << ": '" << argv[i] << "'"); }
+   if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!"); }
+
+   ///////////////////////////////////////////////////////
+   /// Block Storage Creation and Simulation Parameter ///
+   ///////////////////////////////////////////////////////
+
+   auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config());
+
+   // read parameters
+   auto parameters = walberlaEnv.config()->getOneBlock("Parameters");
+
+   const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10));
+   const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.8));
+   const double remainingTimeLoggerFrequency =
+      parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
+   const uint_t VTKwriteFrequency = parameters.getParameter< uint_t >("VTKwriteFrequency", 1000);
+
+   ////////////////////////////////////
+   /// PDF Field and Velocity Setup ///
+   ////////////////////////////////////
+
+   // Common Fields
+   BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
+   BlockDataID flagFieldId     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   // GPU Field for PDFs
+   BlockDataID pdfFieldId = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >(
+      blocks, "pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t(1));
+
+   // GPU Velocity Field
+   BlockDataID velocityFieldIdGPU =
+      cuda::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true);
+#else
+   // CPU Field for PDFs
+   BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx);
+#endif
+
+   // Velocity field setup
+   auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup");
+   initShearFlowVelocityField(blocks, velocityFieldId, shearFlowSetup);
+
+   real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0));
+
+   // pdfs setup
+#if defined(WALBERLA_BUILD_WITH_CUDA) // NOTE(review): LatticeModel.py generates CPU kernels only; confirm this GPU path
+   cuda::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId);
+   pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho);
+#elif defined(WALBERLA_BUILD_WITH_AVX)
+   pystencils::InitialPDFsSetterAVX pdfSetter(pdfFieldId, velocityFieldId, rho);
+#else
+   pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho);
+#endif
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      pdfSetter(&(*blockIt));
+   }
+
+   /////////////
+   /// Sweep ///
+   /////////////
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega);
+#elif defined(WALBERLA_BUILD_WITH_AVX)
+   pystencils::CumulantMRTSweepAVX CumulantMRTSweep(pdfFieldId, velocityFieldId, omega);
+#else
+   pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega);
+#endif
+
+   /////////////////////////
+   /// Boundary Handling ///
+   /////////////////////////
+
+   const FlagUID fluidFlagUID("Fluid");
+
+   auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries");
+
+   NoSlip_T noSlip(blocks, pdfFieldId);
+
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
+
+   /////////////////
+   /// Time Loop ///
+   /////////////////
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   // Communication
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::communication::UniformGPUScheme< Stencil_T > com(blocks, 0);
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+   auto communication = std::function< void() >([&]() { com.communicate(nullptr); });
+#else
+   blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
+   communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+#endif
+
+   // Timeloop
+   timeloop.add() << BeforeFunction(communication, "communication") << Sweep(noSlip);
+   timeloop.add() << Sweep(CumulantMRTSweep);
+
+   // Time logger
+   timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+                                 "remaining time logger");
+
+   if (VTKwriteFrequency > 0)
+   {
+      const std::string path = "vtk_out/tut03";
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0,
+                                                      false, path, "simulation_step", false, true, true, false, 0);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      // Copy velocity data to CPU before output
+      vtkOutput->addBeforeFunction(
+         [&]() { cuda::fieldCpy< VectorField_T, GPUField >(blocks, velocityFieldId, velocityFieldIdGPU); });
+#endif
+
+      auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity");
+      vtkOutput->addCellDataWriter(velWriter);
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   timeloop.run();
+
+   return EXIT_SUCCESS;
 }
+
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); } // global entry point: forwards to walberla::main
diff --git a/apps/example_app_codegen/ExampleApp.prm b/apps/example_app_codegen/ExampleApp.prm
new file mode 100644
index 0000000000000000000000000000000000000000..d292c02d13450a5ae947f3284ded79baf2348538
--- /dev/null
+++ b/apps/example_app_codegen/ExampleApp.prm
@@ -0,0 +1,38 @@
+
+Parameters 
+{
+	omega           1.8;
+	timesteps       10001;
+
+	remainingTimeLoggerFrequency 3; // in seconds
+	VTKwriteFrequency 1000;
+}
+
+ShearFlowSetup
+{
+   rho   1.0;
+   
+   velocityMagnitude   0.08;
+   noiseMagnitude     0.005;
+
+   noiseSeed            42;
+}
+
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  300, 80, 1 >;
+   periodic      <  1,    0, 1 >;  
+}
+
+StabilityChecker
+{
+   checkFrequency 1;
+   streamOutput   false;
+   vtkOutput      true;
+}
+
+Boundaries 
+{   
+	Border { direction S,N; walldistance -1; flag NoSlip; }		
+}
diff --git a/apps/example_app_codegen/LatticeModel.py b/apps/example_app_codegen/LatticeModel.py
old mode 100644
new mode 100755
index 446f2f02bfc4783d7d41a32059ca82f0f8716639..b0d28978c984be91a90811841b212af0dd32c6c9
--- a/apps/example_app_codegen/LatticeModel.py
+++ b/apps/example_app_codegen/LatticeModel.py
@@ -1,10 +1,16 @@
+#!/usr/bin/python3
 import sympy as sp
+import pystencils as ps
 
 from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
-from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_update_rule
 
-from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel
-from lbmpy_walberla import generate_lattice_model
+from lbmpy.creationfunctions import create_lb_update_rule
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.boundaries import NoSlip
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
+from lbmpy_walberla import generate_boundary
+
 
 #   ========================
 #      General Parameters
@@ -14,21 +20,67 @@ stencil = LBStencil(Stencil.D2Q9)
 omega = sp.Symbol('omega')
 layout = 'fzyx'
 
-#   Optimizations for the LBM Method
-lbm_opt = LBMOptimisation(cse_global=True, field_layout=layout)
+#   PDF Fields
+pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): [2D]', layout=layout)
+
+#   Velocity Output Field
+velocity = ps.fields(f"velocity({stencil.D}): [2D]", layout=layout)
+output = {'velocity': velocity}
+
+# LBM Optimisation
+lbm_opt = LBMOptimisation(cse_global=True,
+                          symbolic_field=pdfs,
+                          symbolic_temporary_field=pdfs_tmp,
+                          field_layout=layout)
+
+
+#   ==================
+#      Method Setup
+#   ==================
 
-#   ===========================
-#      SRT Method Definition
-#   ===========================
+lbm_config = LBMConfig(stencil=stencil,
+                       method=Method.CUMULANT,
+                       relaxation_rate=omega,
+                       compressible=True,
+                       output=output)
 
-lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega)
+lbm_update_rule = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
 
-collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+lbm_method = lbm_update_rule.method
+
+#   ========================
+#      PDF Initialization
+#   ========================
+
+initial_rho = sp.Symbol('rho_0')
+
+pdfs_setter = macroscopic_values_setter(lbm_method,
+                                        initial_rho,
+                                        velocity.center_vector,
+                                        pdfs.center_vector)
 
 #   =====================
 #      Code Generation
 #   =====================
 
+cpu_vectorize_info = {
+    "instruction_set": "avx",
+    "assume_inner_stride_one": True,
+    "assume_aligned": True,  # NOTE(review): Readme.md reports crashes from under-aligned fields in the AVX build — confirm allocations satisfy this
+    "assume_sufficient_line_padding": False}
+params_cpu = {"target": ps.Target.CPU}  # keyword arguments for the kernels generated without suffix
+params_avx = {"target": ps.Target.CPU, "cpu_vectorize_info": cpu_vectorize_info}  # same target plus the vectorization settings above
+
 with CodeGeneration() as ctx:
-    # generation of the lattice model ...
-    generate_lattice_model(ctx, "LatticeModel", collision_rule, field_layout=layout)
+    for optim, params in zip(("", "AVX"), (params_cpu, params_avx)):
+        print(f"generating kernels for CPU {optim}")
+        """
+        generate_sweep(ctx, f"CumulantMRTSweep{optim}", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], **params)
+        generate_pack_info_from_kernel(ctx, f"CumulantMRTPackInfo{optim}", lbm_update_rule, **params)
+        generate_sweep(ctx, f"InitialPDFsSetter{optim}", pdfs_setter, **params)
+        generate_boundary(ctx, f"CumulantMRTNoSlip{optim}", NoSlip(), lbm_method, **params)
+        """
+        generate_sweep(ctx, f"CumulantMRTSweep{optim}", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], **params)
+        generate_pack_info_from_kernel(ctx, f"CumulantMRTPackInfo{optim}", lbm_update_rule, **params)
+        generate_sweep(ctx, f"InitialPDFsSetter{optim}", pdfs_setter, **params)
+        generate_boundary(ctx, f"CumulantMRTNoSlip{optim}", NoSlip(), lbm_method, **params)
diff --git a/apps/example_app_codegen/Readme.md b/apps/example_app_codegen/Readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..2cf7cfd2c17f6210ce4c79def34f6e630a586a74
--- /dev/null
+++ b/apps/example_app_codegen/Readme.md
@@ -0,0 +1,62 @@
+Set up a MWE:
+```sh
+git clone https://i10git.cs.fau.de/walberla/example_app.git
+cd example_app/apps/example_app_codegen/
+# copy .cpp, .py, .prm files, update list of generated files in CMakeLists.txt, chmod +x the python file and add shebang '#!/usr/bin/python3'
+cd $(git rev-parse --show-toplevel)
+mkdir build
+cd build
+```
+
+Compile with Clang in debug mode with lbmpy 0.4.4:
+```sh
+VERSION=0.4.4 DEPS="/work/jgrad/walberla_deps" PYTHONPATH="${DEPS}/${VERSION}/lbmpy:${DEPS}/${VERSION}/pystencils:${DEPS}/devel/walberla/python/" CC=clang CXX=clang++ cmake .. -DWALBERLA_DIR=/work/jgrad/walberla_deps/devel/walberla -DWALBERLA_BUILD_WITH_CODEGEN=ON -DCMAKE_BUILD_TYPE=Debug
+VERSION=0.4.4 DEPS="/work/jgrad/walberla_deps" PYTHONPATH="${DEPS}/${VERSION}/lbmpy:${DEPS}/${VERSION}/pystencils:${DEPS}/devel/walberla/python/" make -j$(nproc)
+```
+
+Then compile the AVX binary separately with:
+```sh
+(cd /work/jgrad/walberla_deps/devel/example_app/build/apps/example_app_codegen && /usr/bin/ccache /usr/bin/clang++  -DBOOST_ALL_NO_LIB -I/work/jgrad/walberla_deps/devel/example_app/build/walberla/src -I/work/jgrad/walberla_deps/devel/walberla/src -I/work/jgrad/walberla_deps/devel/example_app/build/apps/example_app_codegen/default_codegen -isystem /work/jgrad/walberla_deps/devel/example_app/src -isystem /work/jgrad/walberla_deps/devel/example_app/build/src -isystem /work/jgrad/walberla_deps/0.4.4/pystencils/pystencils/include -isystem /usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -isystem /usr/lib/x86_64-linux-gnu/openmpi/include  -Wall -Wconversion -Wshadow -Wno-c++11-extensions -Qunused-arguments -pthread -pthread -g   -std=gnu++17 -DWALBERLA_BUILD_WITH_AVX -o CMakeFiles/ExampleAppCodegen.dir/ExampleAppAVX.cpp.o -c /work/jgrad/walberla_deps/devel/example_app/apps/example_app_codegen/ExampleApp.cpp)
+(cd /work/jgrad/walberla_deps/devel/example_app/build/apps/example_app_codegen && /tikhome/jgrad/.local/lib/python3.8/site-packages/cmake/data/bin/cmake -E cmake_link_script CMakeFiles/ExampleAppCodegen.dir/link.txt --verbose=1
+/usr/bin/clang++   -Wall -Wconversion -Wshadow -Wno-c++11-extensions -Qunused-arguments -pthread -pthread -g    CMakeFiles/ExampleAppCodegen.dir/ExampleAppAVX.cpp.o  -o ExampleAppCodegenAVX  -Wl,-rpath,/usr/lib/x86_64-linux-gnu/openmpi/lib ../../walberla/src/blockforest/libblockforest.a ../../walberla/src/core/libcore.a ../../walberla/src/field/libfield.a ../../walberla/src/lbm/liblbm.a ../../walberla/src/geometry/libgeometry.a ../../walberla/src/timeloop/libtimeloop.a ../../walberla/src/gui/libgui.a libLatticeModelGenerated.a ../../walberla/src/domain_decomposition/libdomain_decomposition.a ../../walberla/src/vtk/libvtk.a ../../walberla/src/boundary/libboundary.a ../../walberla/src/blockforest/libblockforest.a ../../walberla/src/core/libcore.a ../../walberla/src/field/libfield.a ../../walberla/src/lbm/liblbm.a ../../walberla/src/geometry/libgeometry.a ../../walberla/src/timeloop/libtimeloop.a ../../walberla/src/gui/libgui.a libLatticeModelGenerated.a ../../walberla/src/domain_decomposition/libdomain_decomposition.a ../../walberla/src/vtk/libvtk.a ../../walberla/src/boundary/libboundary.a ../../walberla/src/blockforest/libblockforest.a ../../walberla/src/core/libcore.a ../../walberla/src/field/libfield.a ../../walberla/src/lbm/liblbm.a ../../walberla/src/geometry/libgeometry.a ../../walberla/src/timeloop/libtimeloop.a ../../walberla/src/gui/libgui.a libLatticeModelGenerated.a ../../walberla/src/domain_decomposition/libdomain_decomposition.a ../../walberla/src/vtk/libvtk.a ../../walberla/src/boundary/libboundary.a /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi_cxx.so /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so /usr/lib/libpfft.so /usr/lib/x86_64-linux-gnu/libfftw3.so /usr/lib/x86_64-linux-gnu/libfftw3_mpi.so ../../walberla/extern/lodepng/liblodepng.a /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi_cxx.so /usr/lib/libpfft.so /usr/lib/x86_64-linux-gnu/libfftw3.so /usr/lib/x86_64-linux-gnu/libfftw3_mpi.so)
+```
+
+Run the binaries with the parameter files:
+```sh
+apps/example_app_codegen/ExampleAppCodegen ../apps/example_app_codegen/ExampleApp.prm
+apps/example_app_codegen/ExampleAppCodegenAVX ../apps/example_app_codegen/ExampleApp.prm
+```
+
+The AVX binary will fail at random with a SIGSEGV, because the fields
+are allocated with 8-byte alignment, although 32-byte alignment is
+required for aligned AVX loads of doubles. The `src/field/Field.impl.h`
+file has ifdefs to select the correct alignment if AVX2 is defined,
+however:
+
+* the `alignment` value is 16 instead of 32
+* the `sizeof(T) < alignment` uses `T=const float [13]`, but the conditional
+  was probably meant to test a hypothetical type `T_underlying=const float`
+* the conditional evaluates to `false` but takes the `true` branch in GDB
+  (in the ESPResSo bridge, the `false` branch is taken)
+* the `allocator_` shared pointer should dereference to
+  a `walberla::field::AllocateAligned<unsigned char, 16>` object,
+  but instead it dereferences to a generic allocator with 8-byte alignment
+
+GDB setup:
+```
+gdb --args apps/example_app_codegen/ExampleAppCodegenAVX ../apps/example_app_codegen/ExampleApp.prm
+(gdb) b /work/jgrad/walberla_deps/devel/walberla/src/field/Field.impl.h:341
+(gdb) run
+(gdb) tui e
+```
+
+Then in GDB, the execution was stepped through to check the values in the conditional as well
+as the allocated pointer, which is often 8-byte aligned instead of 16-byte or 32-byte aligned:
+```
+(gdb) print mem
+$1 = (double *) 0x15554d528028
+(gdb) python print(0x15554d528028 / 32)
+733003551745.25
+```
+
+Then run `continue` until the SIGSEGV is hit.