diff --git a/CMakeLists.txt b/CMakeLists.txt index 087d203d7b4c54cbfd309ae078b4f3688008d9fe..e7226a9ab757ff1597e2f7ddc18ee674f5857229 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,13 @@ enable_testing() include_directories( src ) include_directories ( ${your_project_name_BINARY_DIR}/src ) +find_program(CCACHE ccache) +if(NOT CCACHE) + message(FATAL_ERROR "Cannot find ccache") +endif() +message(STATUS "Found ccache ${CCACHE}") +set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE} CACHE STRING "ccache executable") + # Extends cmake module path - so that FindwaLBerla.cmake in the current directory is found set ( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${your_project_name_SOURCE_DIR} ) find_package( waLBerla ) diff --git a/apps/example_app_codegen/CMakeLists.txt b/apps/example_app_codegen/CMakeLists.txt index 3ba77d89073b58032a25d1f961a051971a4ac9af..435bae3048ac67346599af5ec353b006de851234 100644 --- a/apps/example_app_codegen/CMakeLists.txt +++ b/apps/example_app_codegen/CMakeLists.txt @@ -1,9 +1,10 @@ waLBerla_link_files_to_builddir( *.prm *.py) waLBerla_generate_target_from_python(NAME LatticeModelGenerated FILE LatticeModel.py - OUT_FILES LatticeModel.cpp LatticeModel.h + OUT_FILES CumulantMRTSweep.h CumulantMRTSweep.cpp CumulantMRTPackInfo.h CumulantMRTPackInfo.cpp InitialPDFsSetter.h InitialPDFsSetter.cpp CumulantMRTNoSlip.h CumulantMRTNoSlip.cpp + CumulantMRTSweepAVX.h CumulantMRTSweepAVX.cpp CumulantMRTPackInfoAVX.h CumulantMRTPackInfoAVX.cpp InitialPDFsSetterAVX.h InitialPDFsSetterAVX.cpp CumulantMRTNoSlipAVX.h CumulantMRTNoSlipAVX.cpp ) - +target_compile_options(LatticeModelGenerated PRIVATE -mavx2) waLBerla_add_executable ( NAME ExampleAppCodegen FILES ExampleApp.cpp diff --git a/apps/example_app_codegen/ExampleApp.cpp b/apps/example_app_codegen/ExampleApp.cpp index 3cdda97703b2e9bf0e5ac3e7d0e09cab73168429..2b987837e578b2a39733b7c736d0bea3edf3bda9 100644 --- a/apps/example_app_codegen/ExampleApp.cpp +++ b/apps/example_app_codegen/ExampleApp.cpp @@ -1,61 
+1,267 @@ //====================================================================================================================== // -// This file is part of waLBerla. waLBerla is free software: you can +// This file is part of waLBerla. waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of +// License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. -// +// // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // -//! \file 01_BlocksAndFields.cpp -//! \author Martin Bauer <martin.bauer@fau.de> +//! \file 03_AdvancedLBMCodegen.cpp +//! 
\author Frederik Hennig <frederik.hennig@fau.de> // //====================================================================================================================== -#include "blockforest/Initialization.h" -#include "core/Environment.h" -#include "field/Field.h" -#include "gui/Gui.h" -#include "timeloop/SweepTimeloop.h" +#include "blockforest/all.h" -namespace walberla { +#include "core/all.h" -Field<real_t, 1>* createFields(IBlock* const block, StructuredBlockStorage * const storage) { - return new Field<real_t,1>(storage->getNumberOfXCells(*block), - storage->getNumberOfYCells(*block), - storage->getNumberOfZCells(*block), - real_c(0)); -} +#if defined(WALBERLA_BUILD_WITH_CUDA) +# include "cuda/AddGPUFieldToStorage.h" +# include "cuda/DeviceSelectMPI.h" +# include "cuda/HostFieldAllocator.h" +# include "cuda/ParallelStreams.h" +# include "cuda/communication/GPUPackInfo.h" +# include "cuda/communication/UniformGPUScheme.h" +#endif + +#include "domain_decomposition/all.h" + +#include "field/all.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/all.h" + +#include "stencil/D2Q9.h" + +#include "timeloop/all.h" -int main( int argc, char ** argv ) +// Codegen Includes +#if defined(WALBERLA_BUILD_WITH_AVX) +#include "CumulantMRTNoSlipAVX.h" +#include "CumulantMRTPackInfoAVX.h" +#include "CumulantMRTSweepAVX.h" +#include "InitialPDFsSetterAVX.h" +#else +#include "CumulantMRTNoSlip.h" +#include "CumulantMRTPackInfo.h" +#include "CumulantMRTSweep.h" +#include "InitialPDFsSetter.h" +#endif +namespace walberla { - walberla::Environment env( argc, argv ); - - shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid( - uint_c(3), uint_c(2), uint_c(4), - uint_c(10), uint_c(8), uint_c(12), - real_c(0.5), - false, - false, false, false); - - blocks->addStructuredBlockData< Field<real_t,1> >( &createFields, "My Field" ); - - - SweepTimeloop timeloop( blocks, uint_c(1) ); - GUI gui( timeloop, blocks, argc, argv ); - gui.run(); 
+/////////////////////// +/// Typedef Aliases /// +/////////////////////// - return EXIT_SUCCESS; -} +// Communication Pack Info +#if defined(WALBERLA_BUILD_WITH_AVX) +typedef pystencils::CumulantMRTPackInfoAVX PackInfo_T; +#else +typedef pystencils::CumulantMRTPackInfo PackInfo_T; +#endif + +// LB Method Stencil +typedef stencil::D2Q9 Stencil_T; + +// PDF field type +typedef field::GhostLayerField< real_t, Stencil_T::Size > PdfField_T; + +// Velocity Field Type +typedef field::GhostLayerField< real_t, Stencil_T::D > VectorField_T; + +// Boundary Handling +typedef walberla::uint8_t flag_t; +typedef FlagField< flag_t > FlagField_T; +#if defined(WALBERLA_BUILD_WITH_AVX) +typedef lbm::CumulantMRTNoSlipAVX NoSlip_T; +#else +typedef lbm::CumulantMRTNoSlip NoSlip_T; +#endif + +#if defined(WALBERLA_BUILD_WITH_CUDA) +typedef cuda::GPUField< real_t > GPUField; +#endif + +////////////////////////////////////////// +/// Shear Flow Velocity Initialization /// +////////////////////////////////////////// + +void initShearFlowVelocityField(const shared_ptr< StructuredBlockForest >& blocks, const BlockDataID& velocityFieldId, + const Config::BlockHandle& config) +{ + math::RealRandom< real_t > rng(config.getParameter< std::mt19937::result_type >("noiseSeed", 42)); + + real_t velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08)); + real_t noiseMagnitude = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude); + + real_t n_y = real_c(blocks->getNumberOfYCells()); + + for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) + { + auto u = (*blockIt).getData< VectorField_T >(velocityFieldId); + + for (auto cellIt = u->beginWithGhostLayerXYZ(); cellIt != u->end(); ++cellIt) + { + Cell globalCell(cellIt.cell()); + blocks->transformBlockLocalToGlobalCell(globalCell, *blockIt); + + real_t relative_y = real_c(globalCell.y()) / n_y; + + u->get(cellIt.cell(), 0) = relative_y < 0.3 || relative_y > 0.7 ? 
velocityMagnitude : -velocityMagnitude; + + u->get(cellIt.cell(), 1) = noiseMagnitude * rng(); + } + } } -int main( int argc, char ** argv ) +///////////////////// +/// Main Function /// +///////////////////// + +int main(int argc, char** argv) { - return walberla::main(argc, argv); + walberla::Environment walberlaEnv(argc, argv); +for (int i = 0; i < argc; ++i)std::cout << "argument " << i << ": '" << argv[i] << "'\n"; + if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!"); } + + /////////////////////////////////////////////////////// + /// Block Storage Creation and Simulation Parameter /// + /////////////////////////////////////////////////////// + + auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config()); + + // read parameters + auto parameters = walberlaEnv.config()->getOneBlock("Parameters"); + + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.8)); + const double remainingTimeLoggerFrequency = + parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const uint_t VTKwriteFrequency = parameters.getParameter< uint_t >("VTKwriteFrequency", 1000); + + //////////////////////////////////// + /// PDF Field and Velocity Setup /// + //////////////////////////////////// + + // Common Fields + BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx); + BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + +#if defined(WALBERLA_BUILD_WITH_CUDA) + // GPU Field for PDFs + BlockDataID pdfFieldId = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >( + blocks, "pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t(1)); + + // GPU Velocity Field + BlockDataID velocityFieldIdGPU = + cuda::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true); +#else 
+ // CPU Field for PDFs + BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx); +#endif + + // Velocity field setup + auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup"); + initShearFlowVelocityField(blocks, velocityFieldId, shearFlowSetup); + + real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0)); + + // pdfs setup +#if defined(WALBERLA_BUILD_WITH_CUDA) + cuda::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId); + pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho); +#elif defined(WALBERLA_BUILD_WITH_AVX) + pystencils::InitialPDFsSetterAVX pdfSetter(pdfFieldId, velocityFieldId, rho); +#else + pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho); +#endif + + for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) + { + pdfSetter(&(*blockIt)); + } + + ///////////// + /// Sweep /// + ///////////// + +#if defined(WALBERLA_BUILD_WITH_CUDA) + pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega); +#elif defined(WALBERLA_BUILD_WITH_AVX) + pystencils::CumulantMRTSweepAVX CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); +#else + pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); +#endif + + ///////////////////////// + /// Boundary Handling /// + ///////////////////////// + + const FlagUID fluidFlagUID("Fluid"); + + auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries"); + + NoSlip_T noSlip(blocks, pdfFieldId); + + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID); + + noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID); + + ///////////////// + /// Time Loop /// + ///////////////// + + SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); + + 
// Communication +#if defined(WALBERLA_BUILD_WITH_CUDA) + cuda::communication::UniformGPUScheme< Stencil_T > com(blocks, 0); + com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); + auto communication = std::function< void() >([&]() { com.communicate(nullptr); }); +#else + blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); + communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); +#endif + + // Timeloop + timeloop.add() << BeforeFunction(communication, "communication") << Sweep(noSlip); + timeloop.add() << Sweep(CumulantMRTSweep); + + // Time logger + timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + + if (VTKwriteFrequency > 0) + { + const std::string path = "vtk_out/tut03"; + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0, + false, path, "simulation_step", false, true, true, false, 0); + +#if defined(WALBERLA_BUILD_WITH_CUDA) + // Copy velocity data to CPU before output + vtkOutput->addBeforeFunction( + [&]() { cuda::fieldCpy< VectorField_T, GPUField >(blocks, velocityFieldId, velocityFieldIdGPU); }); +#endif + + auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity"); + vtkOutput->addCellDataWriter(velWriter); + + timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + timeloop.run(); + + return EXIT_SUCCESS; } + +} // namespace walberla + +int main(int argc, char** argv) { return walberla::main(argc, argv); } diff --git a/apps/example_app_codegen/ExampleApp.prm b/apps/example_app_codegen/ExampleApp.prm new file mode 100644 index 0000000000000000000000000000000000000000..d292c02d13450a5ae947f3284ded79baf2348538 --- /dev/null +++ b/apps/example_app_codegen/ExampleApp.prm @@ -0,0 +1,38 @@ + +Parameters +{ + omega 1.8; + timesteps 10001; + + remainingTimeLoggerFrequency 3; // in seconds + 
VTKwriteFrequency 1000; +} + +ShearFlowSetup +{ + rho 1.0; + + velocityMagnitude 0.08; + noiseMagnitude 0.005; + + noiseSeed 42; +} + +DomainSetup +{ + blocks < 1, 1, 1 >; + cellsPerBlock < 300, 80, 1 >; + periodic < 1, 0, 1 >; +} + +StabilityChecker +{ + checkFrequency 1; + streamOutput false; + vtkOutput true; +} + +Boundaries +{ + Border { direction S,N; walldistance -1; flag NoSlip; } +} diff --git a/apps/example_app_codegen/LatticeModel.py b/apps/example_app_codegen/LatticeModel.py old mode 100644 new mode 100755 index 446f2f02bfc4783d7d41a32059ca82f0f8716639..b0d28978c984be91a90811841b212af0dd32c6c9 --- a/apps/example_app_codegen/LatticeModel.py +++ b/apps/example_app_codegen/LatticeModel.py @@ -1,10 +1,16 @@ +#!/usr/bin/python3 import sympy as sp +import pystencils as ps from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil -from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_update_rule -from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel -from lbmpy_walberla import generate_lattice_model +from lbmpy.creationfunctions import create_lb_update_rule +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter +from lbmpy.boundaries import NoSlip + +from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel +from lbmpy_walberla import generate_boundary + # ======================== # General Parameters @@ -14,21 +20,67 @@ stencil = LBStencil(Stencil.D2Q9) omega = sp.Symbol('omega') layout = 'fzyx' -# Optimizations for the LBM Method -lbm_opt = LBMOptimisation(cse_global=True, field_layout=layout) +# PDF Fields +pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): [2D]', layout=layout) + +# Velocity Output Field +velocity = ps.fields(f"velocity({stencil.D}): [2D]", layout=layout) +output = {'velocity': velocity} + +# LBM Optimisation +lbm_opt = LBMOptimisation(cse_global=True, + symbolic_field=pdfs, + symbolic_temporary_field=pdfs_tmp, 
+ field_layout=layout) + + +# ================== +# Method Setup +# ================== -# =========================== -# SRT Method Definition -# =========================== +lbm_config = LBMConfig(stencil=stencil, + method=Method.CUMULANT, + relaxation_rate=omega, + compressible=True, + output=output) -lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega) +lbm_update_rule = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) -collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) +lbm_method = lbm_update_rule.method + +# ======================== +# PDF Initialization +# ======================== + +initial_rho = sp.Symbol('rho_0') + +pdfs_setter = macroscopic_values_setter(lbm_method, + initial_rho, + velocity.center_vector, + pdfs.center_vector) # ===================== # Code Generation # ===================== +cpu_vectorize_info = { + "instruction_set": "avx", + "assume_inner_stride_one": True, + "assume_aligned": True, + "assume_sufficient_line_padding": False} +params_cpu = {"target": ps.Target.CPU} +params_avx = {"target": ps.Target.CPU, "cpu_vectorize_info": cpu_vectorize_info} + with CodeGeneration() as ctx: - # generation of the lattice model ... 
- generate_lattice_model(ctx, "LatticeModel", collision_rule, field_layout=layout) + for optim, params in zip(("", "AVX"), (params_cpu, params_avx)): + print(f"generating kernels for CPU {optim}") + """ + generate_sweep(ctx, f"CumulantMRTSweep{optim}", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], **params) + generate_pack_info_from_kernel(ctx, f"CumulantMRTPackInfo{optim}", lbm_update_rule, **params) + generate_sweep(ctx, f"InitialPDFsSetter{optim}", pdfs_setter, **params) + generate_boundary(ctx, f"CumulantMRTNoSlip{optim}", NoSlip(), lbm_method, **params) + """ + generate_sweep(ctx, f"CumulantMRTSweep{optim}", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], **params) + generate_pack_info_from_kernel(ctx, f"CumulantMRTPackInfo{optim}", lbm_update_rule, **params) + generate_sweep(ctx, f"InitialPDFsSetter{optim}", pdfs_setter, **params) + generate_boundary(ctx, f"CumulantMRTNoSlip{optim}", NoSlip(), lbm_method, **params) diff --git a/apps/example_app_codegen/Readme.md b/apps/example_app_codegen/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..2cf7cfd2c17f6210ce4c79def34f6e630a586a74 --- /dev/null +++ b/apps/example_app_codegen/Readme.md @@ -0,0 +1,62 @@ +Set up a MWE: +```sh +git clone https://i10git.cs.fau.de/walberla/example_app.git +cd example_app/apps/example_app_codegen/ +# copy .cpp, .py, .prm files, update list of generated files in CMakeLists.txt, chmod +x the python file and add shebang '#!/usr/bin/python3' +cd $(git rev-parse --show-toplevel) +mkdir build +cd build +``` + +Compile with Clang in debug mode with lbmpy 0.4.4: +```sh +VERSION=0.4.4 DEPS="/work/jgrad/walberla_deps" PYTHONPATH="${DEPS}/${VERSION}/lbmpy:${DEPS}/${VERSION}/pystencils:${DEPS}/devel/walberla/python/" CC=clang CXX=clang++ cmake .. 
-DWALBERLA_DIR=/work/jgrad/walberla_deps/devel/walberla -DWALBERLA_BUILD_WITH_CODEGEN=ON -DCMAKE_BUILD_TYPE=Debug +VERSION=0.4.4 DEPS="/work/jgrad/walberla_deps" PYTHONPATH="${DEPS}/${VERSION}/lbmpy:${DEPS}/${VERSION}/pystencils:${DEPS}/devel/walberla/python/" make -j$(nproc) +``` + +Then compile the AVX binary separately with: +```sh +(cd /work/jgrad/walberla_deps/devel/example_app/build/apps/example_app_codegen && /usr/bin/ccache /usr/bin/clang++ -DBOOST_ALL_NO_LIB -I/work/jgrad/walberla_deps/devel/example_app/build/walberla/src -I/work/jgrad/walberla_deps/devel/walberla/src -I/work/jgrad/walberla_deps/devel/example_app/build/apps/example_app_codegen/default_codegen -isystem /work/jgrad/walberla_deps/devel/example_app/src -isystem /work/jgrad/walberla_deps/devel/example_app/build/src -isystem /work/jgrad/walberla_deps/0.4.4/pystencils/pystencils/include -isystem /usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -isystem /usr/lib/x86_64-linux-gnu/openmpi/include -Wall -Wconversion -Wshadow -Wno-c++11-extensions -Qunused-arguments -pthread -pthread -g -std=gnu++17 -DWALBERLA_BUILD_WITH_AVX -o CMakeFiles/ExampleAppCodegen.dir/ExampleAppAVX.cpp.o -c /work/jgrad/walberla_deps/devel/example_app/apps/example_app_codegen/ExampleApp.cpp) +(cd /work/jgrad/walberla_deps/devel/example_app/build/apps/example_app_codegen && /tikhome/jgrad/.local/lib/python3.8/site-packages/cmake/data/bin/cmake -E cmake_link_script CMakeFiles/ExampleAppCodegen.dir/link.txt --verbose=1 +/usr/bin/clang++ -Wall -Wconversion -Wshadow -Wno-c++11-extensions -Qunused-arguments -pthread -pthread -g CMakeFiles/ExampleAppCodegen.dir/ExampleAppAVX.cpp.o -o ExampleAppCodegenAVX -Wl,-rpath,/usr/lib/x86_64-linux-gnu/openmpi/lib ../../walberla/src/blockforest/libblockforest.a ../../walberla/src/core/libcore.a ../../walberla/src/field/libfield.a ../../walberla/src/lbm/liblbm.a ../../walberla/src/geometry/libgeometry.a ../../walberla/src/timeloop/libtimeloop.a ../../walberla/src/gui/libgui.a 
libLatticeModelGenerated.a ../../walberla/src/domain_decomposition/libdomain_decomposition.a ../../walberla/src/vtk/libvtk.a ../../walberla/src/boundary/libboundary.a ../../walberla/src/blockforest/libblockforest.a ../../walberla/src/core/libcore.a ../../walberla/src/field/libfield.a ../../walberla/src/lbm/liblbm.a ../../walberla/src/geometry/libgeometry.a ../../walberla/src/timeloop/libtimeloop.a ../../walberla/src/gui/libgui.a libLatticeModelGenerated.a ../../walberla/src/domain_decomposition/libdomain_decomposition.a ../../walberla/src/vtk/libvtk.a ../../walberla/src/boundary/libboundary.a ../../walberla/src/blockforest/libblockforest.a ../../walberla/src/core/libcore.a ../../walberla/src/field/libfield.a ../../walberla/src/lbm/liblbm.a ../../walberla/src/geometry/libgeometry.a ../../walberla/src/timeloop/libtimeloop.a ../../walberla/src/gui/libgui.a libLatticeModelGenerated.a ../../walberla/src/domain_decomposition/libdomain_decomposition.a ../../walberla/src/vtk/libvtk.a ../../walberla/src/boundary/libboundary.a /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi_cxx.so /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so /usr/lib/libpfft.so /usr/lib/x86_64-linux-gnu/libfftw3.so /usr/lib/x86_64-linux-gnu/libfftw3_mpi.so ../../walberla/extern/lodepng/liblodepng.a /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi_cxx.so /usr/lib/libpfft.so /usr/lib/x86_64-linux-gnu/libfftw3.so /usr/lib/x86_64-linux-gnu/libfftw3_mpi.so) +``` + +Run the binaries with the parameter files: +```sh +apps/example_app_codegen/ExampleAppCodegen ../apps/example_app_codegen/ExampleApp.prm +apps/example_app_codegen/ExampleAppCodegenAVX ../apps/example_app_codegen/ExampleApp.prm +``` + +The AVX binary will fail at random with a SIGSEGV, because the fields +are allocated with 8-byte alignment, although 32-byte alignment is +required to safely load doubles with aligned AVX instructions.
The `src/field/Field.impl.h` +file has ifdefs to select the correct alignment if AVX2 is defined, +however: + +* the `alignment` value is 16 instead of 32 +* the `sizeof(T) < alignment` uses `T=const float [13]`, but the conditional + was probably meant to test a hypothetical type `T_underlying=const float` +* the conditional evaluates to `false` but takes the `true` branch in GDB + (in the ESPResSo bridge, the `false` branch is taken) +* the `allocator_` shared pointer should dereference to + a `walberla::field::AllocateAligned<unsigned char, 16>` object, + but instead it dereferences to a generic allocator with 8-byte alignment + +GDB setup: +``` +gdb --args apps/example_app_codegen/ExampleAppCodegenAVX ../apps/example_app_codegen/ExampleApp.prm +(gdb) b /work/jgrad/walberla_deps/devel/walberla/src/field/Field.impl.h:341 +(gdb) run +(gdb) tui e +``` + +Then in GDB, the execution was stepped through to check the values in the conditional as well +as the allocated pointer, which is often 8-byte aligned instead of 16-byte or 32-byte aligned: +``` +(gdb) print mem +$1 = (double *) 0x15554d528028 +(gdb) python print(0x15554d528028 / 32) +733003551745.25 +``` + +Then run `continue` until the SIGSEGV is hit.