diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index f9c1d6e7a8feeb0d4bfbb6a1e55ab2946c2b8cee..41519581d10865c5a1f6c405178ab79725ba7776 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -21,6 +21,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) add_subdirectory( FieldCommunication ) if ( WALBERLA_BUILD_WITH_CODEGEN ) + add_subdirectory( FlowAroundSphereCodeGen ) add_subdirectory( UniformGridGenerated ) add_subdirectory( PhaseFieldAllenCahn ) endif() diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f7a128f33735ef9365c6daca215de07f7a09ac0 --- /dev/null +++ b/apps/benchmarks/FlowAroundSphereCodeGen/CMakeLists.txt @@ -0,0 +1,31 @@ +waLBerla_link_files_to_builddir( "*.py" ) + +if (WALBERLA_BUILD_WITH_CUDA) + waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated + FILE FlowAroundSphereCodeGen.py + OUT_FILES FlowAroundSphereCodeGen_LbSweep.cu FlowAroundSphereCodeGen_LbSweep.h + FlowAroundSphereCodeGen_MacroSetter.cu FlowAroundSphereCodeGen_MacroSetter.h + FlowAroundSphereCodeGen_UBB.cu FlowAroundSphereCodeGen_UBB.h + FlowAroundSphereCodeGen_NoSlip.cu FlowAroundSphereCodeGen_NoSlip.h + FlowAroundSphereCodeGen_Outflow.cu FlowAroundSphereCodeGen_Outflow.h + FlowAroundSphereCodeGen_PackInfoEven.cu FlowAroundSphereCodeGen_PackInfoEven.h + FlowAroundSphereCodeGen_PackInfoOdd.cu FlowAroundSphereCodeGen_PackInfoOdd.h + FlowAroundSphereCodeGen_InfoHeader.h) + waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk FlowAroundSphereGenerated) + set_target_properties( FlowAroundSphereCodeGen PROPERTIES CXX_VISIBILITY_PRESET hidden) +else () + waLBerla_generate_target_from_python(NAME FlowAroundSphereGenerated + FILE FlowAroundSphereCodeGen.py + OUT_FILES FlowAroundSphereCodeGen_LbSweep.cpp FlowAroundSphereCodeGen_LbSweep.h + FlowAroundSphereCodeGen_MacroSetter.cpp FlowAroundSphereCodeGen_MacroSetter.h + FlowAroundSphereCodeGen_UBB.cpp FlowAroundSphereCodeGen_UBB.h + FlowAroundSphereCodeGen_NoSlip.cpp FlowAroundSphereCodeGen_NoSlip.h + FlowAroundSphereCodeGen_Outflow.cpp FlowAroundSphereCodeGen_Outflow.h + FlowAroundSphereCodeGen_PackInfoEven.cpp FlowAroundSphereCodeGen_PackInfoEven.h + FlowAroundSphereCodeGen_PackInfoOdd.cpp FlowAroundSphereCodeGen_PackInfoOdd.h + FlowAroundSphereCodeGen_InfoHeader.h) + waLBerla_add_executable( NAME FlowAroundSphereCodeGen FILE FlowAroundSphereCodeGen.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk FlowAroundSphereGenerated) + set_target_properties( FlowAroundSphereCodeGen PROPERTIES CXX_VISIBILITY_PRESET hidden) +endif() \ No newline at end of file diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp new file mode 100644 index 0000000000000000000000000000000000000000..68aab5882901317263c04045b69ac0a6de403adc --- /dev/null +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp @@ -0,0 +1,307 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FlowAroundSphereCodeGen.cpp +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== +#include "blockforest/all.h" + +#include "core/all.h" + +#include "domain_decomposition/all.h" + +#include "field/all.h" + +#include "geometry/all.h" + +#include "lbm/inplace_streaming/TimestepTracker.h" +#include "lbm/vtk/QCriterion.h" + +#include "python_coupling/CreateConfig.h" +#include "python_coupling/PythonCallback.h" + +#include "timeloop/all.h" + +#if defined(WALBERLA_BUILD_WITH_CUDA) +# include "cuda/AddGPUFieldToStorage.h" +# include "cuda/DeviceSelectMPI.h" +# include "cuda/HostFieldAllocator.h" +# include "cuda/NVTX.h" +# include "cuda/ParallelStreams.h" +# include "cuda/communication/GPUPackInfo.h" +# include "cuda/communication/UniformGPUScheme.h" +#endif + +// CodeGen includes +#include "FlowAroundSphereCodeGen_InfoHeader.h" + +namespace walberla +{ +typedef lbm::FlowAroundSphereCodeGen_PackInfoEven PackInfoEven_T; +typedef lbm::FlowAroundSphereCodeGen_PackInfoOdd PackInfoOdd_T; + +typedef walberla::uint8_t flag_t; +typedef FlagField< flag_t > FlagField_T; + +#if defined(WALBERLA_BUILD_WITH_CUDA) +typedef cuda::GPUField< real_t > GPUField; +#endif + +using namespace std::placeholders; + +auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) { + return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block), + storage->getNumberOfZCells(*block), uint_t(1), field::fzyx, + make_shared< field::AllocateAligned< real_t, 64 > >()); +}; + +auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block, + real_t inflow_velocity) { + Cell globalCell; + CellInterval domain = SbF->getDomainCellBB(); + real_t h_y = domain.yMax() - domain.yMin(); + real_t h_z = domain.zMax() - domain.zMin(); + SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); + + real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5); + real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5); + + real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) * + (h_z / 2 + z1); + + Vector3< real_t > result(u, 0.0, 0.0); + return result; +}; + +class AlternatingBeforeFunction +{ + public: + typedef std::function< void() > BeforeFunction; + + AlternatingBeforeFunction(BeforeFunction evenFunc, BeforeFunction oddFunc, + std::shared_ptr< lbm::TimestepTracker >& tracker) + : tracker_(tracker), funcs_{ evenFunc, oddFunc } {}; + + void operator()() { funcs_[tracker_->getCounter()](); } + + private: + std::shared_ptr< lbm::TimestepTracker > tracker_; + std::vector< BeforeFunction > funcs_; +}; + +class Filter +{ + public: + explicit Filter(Vector3< uint_t > numberOfCells) : numberOfCells_(numberOfCells) {} + + void operator()(const IBlock& /*block*/) {} + + bool operator()(const cell_idx_t x, const cell_idx_t y, const cell_idx_t z) const + { + return x >= -1 && x <= cell_idx_t(numberOfCells_[0]) && y >= -1 && y <= cell_idx_t(numberOfCells_[1]) && + z >= -1 && z <= cell_idx_t(numberOfCells_[2]); + } + + private: + Vector3< uint_t > numberOfCells_; +}; + +using FluidFilter_T = Filter; + +int main(int argc, char** argv) +{ + walberla::Environment walberlaEnv(argc, argv); +#if defined(WALBERLA_BUILD_WITH_CUDA) + cuda::selectDeviceBasedOnMpiRank(); +#endif + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + WALBERLA_MPI_WORLD_BARRIER() + + auto config = *cfg; + logging::configureLogging(config); + auto blocks = blockforest::createUniformBlockGridFromConfig(config); + + // read parameters + Vector3< uint_t > cellsPerBlock = + config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); + auto parameters = config->getOneBlock("Parameters"); + + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10)); + const real_t omega = parameters.getParameter< real_t >("omega", real_t(1.9)); + const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05)); + const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000)); + const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5)); + + const double remainingTimeLoggerFrequency = + parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + + // create fields + BlockDataID pdfFieldID = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "PDFs"); + BlockDataID velFieldID = field::addToStorage< VelocityField_T >(blocks, "velocity", real_t(0), field::fzyx); + BlockDataID densityFieldID = field::addToStorage< ScalarField_T >(blocks, "density", real_t(0), field::fzyx); + +#if defined(WALBERLA_BUILD_WITH_CUDA) + BlockDataID pdfFieldIDGPU = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "PDFs on GPU", true); + BlockDataID velFieldIDGPU = + cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldID, "velocity on GPU", true); + BlockDataID densityFieldIDGPU = + cuda::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldID, "density on GPU", true); +#endif + + BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + + // initialise all PDFs +#if defined(WALBERLA_BUILD_WITH_CUDA) + pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldIDGPU, velFieldIDGPU); + for (auto& block : *blocks) + setterSweep(&block); + cuda::fieldCpy< PdfField_T, GPUField >(blocks, pdfFieldID, pdfFieldIDGPU); +#else + pystencils::FlowAroundSphereCodeGen_MacroSetter setterSweep(pdfFieldID, velFieldID); + for (auto& block : *blocks) + setterSweep(&block); +#endif + // Create communication + +#if defined(WALBERLA_BUILD_WITH_CUDA) + // This way of using alternating pack infos is temporary and will soon be replaced + // by something more straight-forward + + cuda::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); + comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU)); + auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); }); + + cuda::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); + comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU)); + auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); }); +#else + blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks); + evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID)); + + blockforest::communication::UniformBufferedScheme< Stencil_T > oddComm(blocks); + oddComm.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldID)); +#endif + + // create and initialize boundary handling + const FlagUID fluidFlagUID("Fluid"); + + auto boundariesConfig = config->getOneBlock("Boundaries"); + + std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > + velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max); + +#if defined(WALBERLA_BUILD_WITH_CUDA) + lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation); + lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldIDGPU); + lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldIDGPU, pdfFieldID); + + lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldIDGPU, pdfFieldIDGPU, velFieldIDGPU, omega); +#else + lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldID, velocity_initialisation); + lbm::FlowAroundSphereCodeGen_NoSlip noSlip(blocks, pdfFieldID); + lbm::FlowAroundSphereCodeGen_Outflow outflow(blocks, pdfFieldID); + + lbm::FlowAroundSphereCodeGen_LbSweep lbSweep(densityFieldID, pdfFieldID, velFieldID, omega); +#endif + + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID); + + ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("UBB"), fluidFlagUID); + noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID); + outflow.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("Outflow"), fluidFlagUID); + + // create time loop + SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); + + // Timestep Tracking and Sweeps + auto tracker = make_shared< lbm::TimestepTracker >(0); + + AlternatingBeforeFunction communication(evenComm, oddComm, tracker); + + // add LBM sweep and communication to time loop + timeloop.add() << BeforeFunction(communication, "communication") + << Sweep(noSlip.getSweep(tracker), "noSlip boundary"); + timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary"); + timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary"); + timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement") + << Sweep(lbSweep.getSweep(tracker), "LB update rule"); + + // LBM stability check + timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >( + config, blocks, pdfFieldID, flagFieldId, fluidFlagUID)), + "LBM stability check"); + + // log remaining time + timeloop.addFuncAfterTimeStep( + timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), + "remaining time logger"); + + // add VTK output to time loop + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + +#if defined(WALBERLA_BUILD_WITH_CUDA) + vtkOutput->addBeforeFunction([&]() { + cuda::fieldCpy< VelocityField_T, GPUField >(blocks, velFieldID, velFieldIDGPU); + cuda::fieldCpy< ScalarField_T, GPUField >(blocks, densityFieldID, densityFieldIDGPU); + }); +#endif + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "velocity"); + auto densityWriter = make_shared< field::VTKWriter< ScalarField_T > >(densityFieldID, "density"); + FluidFilter_T filter(cellsPerBlock); + + auto QCriterionWriter = make_shared< lbm::QCriterionVTKWriter< VelocityField_T, FluidFilter_T > >( + blocks, filter, velFieldID, "Q-Criterion"); + + vtkOutput->addCellDataWriter(velWriter); + vtkOutput->addCellDataWriter(densityWriter); + vtkOutput->addCellDataWriter(QCriterionWriter); + + timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + WcTimer simTimer; + + WALBERLA_LOG_INFO_ON_ROOT("Simulating flow around sphere:" + "\n timesteps: " + << timesteps << "\n reynolds number: " << reynolds_number + << "\n relaxation rate: " << omega << "\n maximum inflow velocity: " << u_max + << "\n diameter_sphere: " << diameter_sphere) + + simTimer.start(); + timeloop.run(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + auto time = simTimer.last(); + auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) + WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) + } + + return EXIT_SUCCESS; +} + +} // namespace walberla + +int main(int argc, char** argv) { walberla::main(argc, argv); } diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py new file mode 100644 index 0000000000000000000000000000000000000000..571f3129ef00a3b4a2a258059b25263170cc73fb --- /dev/null +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py @@ -0,0 +1,97 @@ +from pystencils.field import fields + +from lbmpy.advanced_streaming.utility import get_timesteps, Timestep +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter +from lbmpy.stencils import get_stencil +from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule +from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow + +from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header + +from lbmpy_walberla.additional_data_handler import UBBAdditionalDataHandler, OutflowAdditionalDataHandler +from lbmpy_walberla import generate_boundary, generate_lb_pack_info +from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary + +import sympy as sp + +stencil = get_stencil("D3Q27") +q = len(stencil) +dim = len(stencil[0]) +streaming_pattern = 'esotwist' +timesteps = get_timesteps(streaming_pattern) + +pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx') +omega = sp.Symbol("omega") +u_max = sp.Symbol("u_max") + +output = { + 'density': density_field, + 'velocity': velocity_field +} + +opt = {'symbolic_field': pdfs, + 'cse_global': False, + 'cse_pdfs': False} + +method_params = {'method': 'cumulant', + 'stencil': stencil, + 'relaxation_rate': omega, + 'galilean_correction': True, + 'field_name': 'pdfs', + 'streaming_pattern': streaming_pattern, + 'output': output, + 'optimization': opt} + +collision_rule = create_lb_collision_rule(**method_params) +lb_method = collision_rule.method + +# getter & setter +setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs, density=1, + streaming_pattern=streaming_pattern, + previous_timestep=timesteps[0]) + +# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True} + +stencil_typedefs = {'Stencil_T': stencil} +field_typedefs = {'PdfField_T': pdfs, + 'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + +with CodeGeneration() as ctx: + if ctx.cuda: + target = 'gpu' + else: + target = 'cpu' + + opt['target'] = target + + # sweeps + generate_alternating_lbm_sweep(ctx, 'FlowAroundSphereCodeGen_LbSweep', + collision_rule, streaming_pattern, optimization=opt) + generate_sweep(ctx, 'FlowAroundSphereCodeGen_MacroSetter', setter_assignments, target=target) + + # boundaries + ubb = UBB(lambda *args: None, dim=dim) + ubb_data_handler = UBBAdditionalDataHandler(stencil, ubb) + outflow = ExtrapolationOutflow(stencil[4], lb_method, streaming_pattern=streaming_pattern) + outflow_data_handler = OutflowAdditionalDataHandler(stencil, outflow, target=target) + + generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_UBB', ubb, lb_method, + target=target, streaming_pattern=streaming_pattern, + additional_data_handler=ubb_data_handler) + + generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_NoSlip', NoSlip(), lb_method, + target=target, streaming_pattern=streaming_pattern) + + generate_alternating_lbm_boundary(ctx, 'FlowAroundSphereCodeGen_Outflow', outflow, lb_method, + target=target, streaming_pattern=streaming_pattern, + additional_data_handler=outflow_data_handler) + + # communication + generate_lb_pack_info(ctx, 'FlowAroundSphereCodeGen_PackInfo', stencil, pdfs, + streaming_pattern=streaming_pattern, always_generate_separate_classes=True, target=target) + + # Info header containing correct template definitions for stencil and field + generate_info_header(ctx, 'FlowAroundSphereCodeGen_InfoHeader', + stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs) diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py new file mode 100644 index 0000000000000000000000000000000000000000..6d046735fd3867cb83670e13090d43f0e52166d4 --- /dev/null +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py @@ -0,0 +1,63 @@ +import waLBerla as wlb +from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity + + +class Scenario: + def __init__(self): + self.timesteps = 101 + self.vtkWriteFrequency = 2000 + + self.cells = (384, 128, 128) + self.blocks = (1, 1, 1) + self.periodic = (0, 0, 0) + + self.diameter_sphere = min(self.cells) // 2 + self.u_max = 0.1 + self.reynolds_number = 1000000 + + kinematic_viscosity = (self.diameter_sphere * self.u_max) / self.reynolds_number + + self.omega = relaxation_rate_from_lattice_viscosity(kinematic_viscosity) + + self.total_cells = (self.cells[0] * self.blocks[0], + self.cells[1] * self.blocks[1], + self.cells[2] * self.blocks[2]) + + @wlb.member_callback + def config(self): + return { + 'DomainSetup': { + 'blocks': self.blocks, + 'cellsPerBlock': self.cells, + 'periodic': self.periodic, + 'oneBlockPerProcess': True + }, + 'Parameters': { + 'timesteps': self.timesteps, + 'vtkWriteFrequency': self.vtkWriteFrequency, + 'omega': self.omega, + 'u_max': self.u_max, + 'reynolds_number': self.reynolds_number, + 'diameter_sphere': self.diameter_sphere + }, + 'Boundaries': { + 'Border': [ + {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'W', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'E', 'walldistance': -1, 'flag': 'Outflow'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, + ], + 'Body': [ + {'shape': "sphere", + 'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2), + 'radius': self.diameter_sphere // 2, + 'flag': 'NoSlip'} + ] + }, + } + + +scenarios = wlb.ScenarioManager() +scenarios.add(Scenario()) diff --git a/python/lbmpy_walberla/__init__.py b/python/lbmpy_walberla/__init__.py index 1a7b136a7935d13be6c47d8a101e9efc4c631f56..15de37c8112e16a21476ba3e388adc843af92956 100644 --- a/python/lbmpy_walberla/__init__.py +++ b/python/lbmpy_walberla/__init__.py @@ -1,5 +1,8 @@ -from .boundary import generate_boundary +from .boundary import generate_boundary, generate_alternating_lbm_boundary from .walberla_lbm_generation import RefinementScaling, generate_lattice_model from .packinfo import generate_lb_pack_info +from .alternating_sweeps import generate_alternating_lbm_sweep -__all__ = ['generate_lattice_model', 'RefinementScaling', 'generate_boundary', 'generate_lb_pack_info'] +__all__ = ['generate_lattice_model', 'generate_alternating_lbm_sweep', + 'RefinementScaling', 'generate_boundary', 'generate_alternating_lbm_boundary', + 'generate_lb_pack_info'] diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py index 03553a8f41ab9339b661354f323ba19c24cb6320..ac0dcd81233a5f387eed5912a1fe2338985869d1 100644 --- a/python/lbmpy_walberla/additional_data_handler.py +++ b/python/lbmpy_walberla/additional_data_handler.py @@ -1,11 +1,24 @@ from pystencils.stencil import inverse_direction from lbmpy.advanced_streaming import AccessPdfValues, numeric_offsets, numeric_index +from lbmpy.boundaries.boundaryconditions import LbBoundary from lbmpy.boundaries import ExtrapolationOutflow, UBB from pystencils_walberla.additional_data_handler import AdditionalDataHandler +def default_additional_data_handler(boundary_obj: LbBoundary, lb_method, field_name, target='cpu'): + if not boundary_obj.additional_data: + return None + + if isinstance(boundary_obj, UBB): + return UBBAdditionalDataHandler(lb_method.stencil, boundary_obj) + elif isinstance(boundary_obj, ExtrapolationOutflow): + return OutflowAdditionalDataHandler(lb_method.stencil, boundary_obj, target=target, field_name=field_name) + else: + raise ValueError(f"No default AdditionalDataHandler available for boundary of type {boundary_obj.__class__}") + + class UBBAdditionalDataHandler(AdditionalDataHandler): def __init__(self, stencil, boundary_object): assert isinstance(boundary_object, UBB) @@ -114,7 +127,7 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler): if self._dim == 2: pos.append("0") pos.append(str(numeric_index(pdf_accessor.accs[inv_dir])[0])) - result[f'pdf'] = ', '.join(pos) + result['pdf'] = ', '.join(pos) pos = [] for p, o, t in zip(position, offsets, tangential_offset): @@ -122,6 +135,6 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler): if self._dim == 2: pos.append("0") pos.append(str(numeric_index(pdf_accessor.accs[inv_dir])[0])) - result[f'pdf_nd'] = ', '.join(pos) + result['pdf_nd'] = ', '.join(pos) return result diff --git a/python/lbmpy_walberla/alternating_sweeps.py b/python/lbmpy_walberla/alternating_sweeps.py new file mode 100644 index 0000000000000000000000000000000000000000..4bafc5dc800020fea52674695146cb20e001d0e2 --- /dev/null +++ b/python/lbmpy_walberla/alternating_sweeps.py @@ -0,0 +1,141 @@ +import numpy as np +from pystencils_walberla.codegen import generate_selective_sweep, get_vectorize_instruction_set +from pystencils_walberla.kernel_selection import ( + AbstractInterfaceArgumentMapping, AbstractConditionNode, KernelCallNode) +from pystencils import TypedSymbol +from lbmpy.creationfunctions import create_lb_ast +from lbmpy.advanced_streaming import Timestep, is_inplace + + +class EvenIntegerCondition(AbstractConditionNode): + def __init__(self, parameter_name: str, + branch_true, branch_false, + parameter_dtype=int): + self.parameter_symbol = TypedSymbol(parameter_name, parameter_dtype) + super(EvenIntegerCondition, self).__init__(branch_true, branch_false) + + @property + def selection_parameters(self): + return {self.parameter_symbol} + + @property + def condition_text(self): + return f"(({self.parameter_symbol.name} & 1) ^ 1)" + + +class OddIntegerCondition(AbstractConditionNode): + def __init__(self, parameter_name: str, + branch_true, branch_false, + parameter_dtype=int): + self.parameter_symbol = TypedSymbol(parameter_name, parameter_dtype) + super(OddIntegerCondition, self).__init__(branch_true, branch_false) + + @property + def selection_parameters(self): + return {self.parameter_symbol} + + @property + def condition_text(self): + return f"({self.parameter_symbol.name} & 1)" + + +class TimestepTrackerMapping(AbstractInterfaceArgumentMapping): + + def __init__(self, low_level_arg: TypedSymbol, tracker_identifier='tracker'): + self.tracker_symbol = TypedSymbol(tracker_identifier, 'std::shared_ptr<lbm::TimestepTracker> &') + self.high_level_args = (self.tracker_symbol,) + self.low_level_arg = low_level_arg + + @property + def mapping_code(self): + return f"{self.tracker_symbol.name}->getCounter()" + + @property + def headers(self): + return {'"lbm/inplace_streaming/TimestepTracker.h"'} + + +def generate_alternating_lbm_sweep(generation_context, class_name, collision_rule, streaming_pattern, + namespace='lbm', field_swaps=(), varying_parameters=(), + inner_outer_split=False, ghost_layers_to_include=0, optimization=None, + **create_ast_params): + """Generates an Alternating lattice Boltzmann sweep class. This is in particular meant for + in-place streaming patterns, but can of course also be used with two-fields patterns (why make it + simple if you can make it complicated?). From a collision rule, two kernel implementations are + generated; one for even, and one for odd timesteps. At run time, the correct one is selected + according to a time step counter. This counter can be managed by the `walberla::lbm::TimestepTracker` + class. + + Args: + generation_context: See documentation of `pystencils_walberla.generate_sweep` + class_name: Name of the generated class + collision_rule: LbCollisionRule as returned by `lbmpy.create_lb_collision_rule`. + streaming_pattern: Name of the streaming pattern; see `lbmpy.advanced_streaming` + namespace: see documentation of `generate_sweep` + field_swaps: see documentation of `generate_sweep` + varying_parameters: see documentation of `generate_sweep` + inner_outer_split: see documentation of `generate_sweep` + ghost_layers_to_include: see documentation of `generate_sweep` + optimization: dictionary containing optimization parameters, c.f. `lbmpy.creationfunctions` + create_ast_params: Further parameters passed to `create_lb_ast` + """ + optimization = default_lbm_optimization_parameters(generation_context, optimization) + + target = optimization['target'] + if not generation_context.cuda and target == 'gpu': + return + + ast_even = create_lb_ast(collision_rule=collision_rule, streaming_pattern=streaming_pattern, + timestep=Timestep.EVEN, optimization=optimization, **create_ast_params) + ast_even.function_name = 'even' + kernel_even = KernelCallNode(ast_even) + + if is_inplace(streaming_pattern): + ast_odd = create_lb_ast(collision_rule=collision_rule, streaming_pattern=streaming_pattern, + timestep=Timestep.ODD, optimization=optimization, **create_ast_params) + ast_odd.function_name = 'odd' + kernel_odd = KernelCallNode(ast_odd) + else: + kernel_odd = kernel_even + + tree = EvenIntegerCondition('timestep', kernel_even, kernel_odd, np.uint8) + interface_mappings = [TimestepTrackerMapping(tree.parameter_symbol)] + + vec_info = optimization['vectorization'] + openmp = optimization['openmp'] + + generate_selective_sweep(generation_context, class_name, tree, + interface_mappings=interface_mappings, + target=target, namespace=namespace, + field_swaps=field_swaps, varying_parameters=varying_parameters, + inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, + cpu_vectorize_info=vec_info, cpu_openmp=openmp) + + +# ---------------------------------- Internal -------------------------------------------------------------------------- + + +def default_lbm_optimization_parameters(generation_context, params): + if params is None: + params = dict() + + params['target'] = params.get('target', 'cpu') + params['double_precision'] = params.get('double_precision', generation_context.double_accuracy) + params['openmp'] = params.get('cpu_openmp', generation_context.openmp) + params['vectorization'] = params.get('vectorization', {}) + + if isinstance(params['vectorization'], bool): + do_vectorization = params['vectorization'] + params['vectorization'] = dict() + else: + do_vectorization = True + + vec = params['vectorization'] + if isinstance(vec, dict): + default_vec_is = get_vectorize_instruction_set(generation_context) if do_vectorization else None + + vec['instruction_set'] = vec.get('instruction_set', default_vec_is) + vec['assume_inner_stride_one'] = vec.get('assume_inner_stride_one', True) + vec['assume_aligned'] = vec.get('assume_aligned', False) + vec['nontemporal'] = vec.get('nontemporal', False) + return params diff --git a/python/lbmpy_walberla/boundary.py b/python/lbmpy_walberla/boundary.py index b0dafb7b3b9835073101acfa33a611c71c62095d..795d8565616b8b2335f1c50ec960a170d638542b 100644 --- a/python/lbmpy_walberla/boundary.py +++ b/python/lbmpy_walberla/boundary.py @@ -2,6 +2,14 @@ import pystencils_walberla.boundary from lbmpy.boundaries.boundaryhandling import create_lattice_boltzmann_boundary_kernel from lbmpy.advanced_streaming import Timestep, is_inplace +from pystencils_walberla.kernel_selection import KernelCallNode +from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition, OddIntegerCondition, TimestepTrackerMapping +from lbmpy_walberla.additional_data_handler import default_additional_data_handler + +from pystencils.data_types import TypedSymbol + +import numpy as np + def generate_boundary(generation_context, class_name, @@ -9,28 +17,77 @@ def generate_boundary(generation_context, lb_method, field_name='pdfs', streaming_pattern='pull', - always_generate_separate_classes=False, + prev_timestep=Timestep.BOTH, additional_data_handler=None, + namespace='lbm', **create_kernel_params): - def boundary_creation_function(field, index_field, stencil, boundary_functor, target='cpu', openmp=True, **kwargs): + if boundary_object.additional_data and additional_data_handler is None: + target = create_kernel_params.get('target', 'cpu') + additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, target=target) + + def boundary_creation_function(field, index_field, stencil, boundary_functor, target='cpu', **kwargs): + return create_lattice_boltzmann_boundary_kernel(field, index_field, lb_method, boundary_functor, + streaming_pattern=streaming_pattern, + prev_timestep=prev_timestep, + target=target, + **kwargs) + + pystencils_walberla.boundary.generate_boundary(generation_context, + class_name, + boundary_object, + field_name=field_name, + neighbor_stencil=lb_method.stencil, + index_shape=[len(lb_method.stencil)], + kernel_creation_function=boundary_creation_function, + namespace=namespace, + additional_data_handler=additional_data_handler, + **create_kernel_params) + + +def generate_alternating_lbm_boundary(generation_context, + class_name, + boundary_object, + lb_method, + field_name='pdfs', + streaming_pattern='pull', + after_collision=True, + additional_data_handler=None, + namespace='lbm', + **create_kernel_params): + if boundary_object.additional_data and additional_data_handler is None: + target = create_kernel_params.get('target', 'cpu') + additional_data_handler = default_additional_data_handler(boundary_object, lb_method, field_name, target=target) + + timestep_param_name = 'timestep' + timestep_param_dtype = np.uint8 + timestep_param = TypedSymbol(timestep_param_name, timestep_param_dtype) + + def boundary_creation_function(field, index_field, stencil, boundary_functor, target='cpu', **kwargs): pargs = (field, index_field, lb_method, boundary_functor) kwargs = {'target': target, **kwargs} - if is_inplace(streaming_pattern) or always_generate_separate_classes: - return { - 'EvenSweep': create_lattice_boltzmann_boundary_kernel(*pargs, - streaming_pattern=streaming_pattern, - prev_timestep=Timestep.EVEN, - **kwargs), - 'OddSweep': create_lattice_boltzmann_boundary_kernel(*pargs, - streaming_pattern=streaming_pattern, - prev_timestep=Timestep.ODD, - **kwargs) - } - else: - return create_lattice_boltzmann_boundary_kernel(*pargs, + ast_even = create_lattice_boltzmann_boundary_kernel(*pargs, streaming_pattern=streaming_pattern, - prev_timestep=Timestep.BOTH, + prev_timestep=Timestep.EVEN, **kwargs) + ast_even.function_name = 'even' + kernel_even = KernelCallNode(ast_even) + + if is_inplace(streaming_pattern): + ast_odd = create_lattice_boltzmann_boundary_kernel(*pargs, + streaming_pattern=streaming_pattern, + prev_timestep=Timestep.ODD, + **kwargs) + ast_odd.function_name = 'odd' + kernel_odd = KernelCallNode(ast_odd) + else: + kernel_odd = kernel_even + + if after_collision: + return EvenIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype) + else: + return OddIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype) + + interface_mappings = [TimestepTrackerMapping(timestep_param)] pystencils_walberla.boundary.generate_boundary(generation_context, class_name, @@ -39,6 +96,7 @@ def generate_boundary(generation_context, neighbor_stencil=lb_method.stencil, index_shape=[len(lb_method.stencil)], kernel_creation_function=boundary_creation_function, - namespace='lbm', + namespace=namespace, additional_data_handler=additional_data_handler, + interface_mappings=interface_mappings, **create_kernel_params) diff --git a/python/pystencils_walberla/__init__.py b/python/pystencils_walberla/__init__.py index b6c310169019dcc215268c60cd90e914dfed3648..2c7892f9164feb57949aca3bfd9c976b1a1ae019 100644 --- a/python/pystencils_walberla/__init__.py +++ b/python/pystencils_walberla/__init__.py @@ -3,8 +3,10 @@ from .cmake_integration import CodeGeneration from .codegen import ( generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel, generate_mpidtype_info_from_kernel, generate_sweep, get_vectorize_instruction_set) +from .utility import generate_info_header __all__ = ['CodeGeneration', 'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info', 'generate_mpidtype_info_from_kernel', 'generate_staggered_boundary', 'generate_staggered_flux_boundary', - 'get_vectorize_instruction_set'] + 'get_vectorize_instruction_set', + 'generate_info_header'] diff --git a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py index 0d251236c0f9400bc2aea424e775aa1eb6c7e5a2..c81df201f43951031035c48771e3c891f3c36e33 100644 --- a/python/pystencils_walberla/boundary.py +++ b/python/pystencils_walberla/boundary.py @@ -1,4 +1,3 @@ -from collections import OrderedDict import numpy as np from jinja2 import Environment, PackageLoader, StrictUndefined from pystencils import Field, FieldType @@ -7,9 +6,12 @@ from pystencils.boundaries.createindexlist import ( boundary_index_array_coordinate_names, direction_member_name, numpy_data_type_for_boundary_object) from pystencils.data_types import TypedSymbol, create_type -from pystencils_walberla.codegen import KernelInfo, default_create_kernel_parameters +from pystencils_walberla.codegen import default_create_kernel_parameters from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env from pystencils_walberla.additional_data_handler import AdditionalDataHandler +from pystencils_walberla.kernel_selection import ( + KernelFamily, AbstractKernelSelectionNode, KernelCallNode, HighLevelInterfaceSpec) +from pystencils.astnodes import KernelFunction def generate_boundary(generation_context, @@ -23,6 +25,8 @@ def generate_boundary(generation_context, target='cpu', namespace='pystencils', additional_data_handler=None, + interface_mappings=(), + generate_functor=True, **create_kernel_params): if boundary_object.additional_data and additional_data_handler is None: @@ -51,37 +55,27 @@ def generate_boundary(generation_context, if not kernel_creation_function: kernel_creation_function = create_boundary_kernel - kernels = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object, **create_kernel_params) - if isinstance(kernels, dict): - sweep_to_kernel_info_dict = OrderedDict() - dummy_kernel_info = None - for sweep_class, sweep_kernel in kernels.items(): - sweep_kernel.function_name = "boundary_" + boundary_object.name + '_' + sweep_class - sweep_kernel.assumed_inner_stride_one = False - kernel_info = KernelInfo(sweep_kernel) - sweep_to_kernel_info_dict[sweep_class] = kernel_info - if dummy_kernel_info is None: - dummy_kernel_info = kernel_info - # elif not dummy_kernel_info.has_same_interface(kernel_info): - # raise ValueError("Multiple boundary sweeps must have the same kernel interface!") - multi_sweep = True - else: - multi_sweep = False - kernel = kernels + kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object, **create_kernel_params) + + if isinstance(kernel, KernelFunction): kernel.function_name = "boundary_" + boundary_object.name - kernel.assumed_inner_stride_one = False - kernel_info = KernelInfo(kernel) - sweep_to_kernel_info_dict = {'': kernel_info} - dummy_kernel_info = kernel_info + selection_tree = KernelCallNode(kernel) + elif isinstance(kernel, AbstractKernelSelectionNode): + selection_tree = kernel + else: + raise ValueError(f"kernel_creation_function returned wrong type: {kernel.__class__}") + + kernel_family = KernelFamily(selection_tree, class_name) + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) if additional_data_handler is None: additional_data_handler = AdditionalDataHandler(stencil=neighbor_stencil) context = { + 'kernel': kernel_family, 'class_name': boundary_object.name, - 'sweep_classes': sweep_to_kernel_info_dict, - 'multi_sweep': multi_sweep, - 'dummy_kernel_info': dummy_kernel_info, + 'interface_spec': interface_spec, + 'generate_functor': generate_functor, 'StructName': struct_name, 'StructDeclaration': struct_from_numpy_dtype(struct_name, index_struct_dtype), 'dim': dim, diff --git a/python/pystencils_walberla/cmake_integration.py b/python/pystencils_walberla/cmake_integration.py index d8b7e615c6ef93d177fc3245661baded53252ec4..8533b21359a91fc52fa865185e5cf7242d8dad0c 100644 --- a/python/pystencils_walberla/cmake_integration.py +++ b/python/pystencils_walberla/cmake_integration.py @@ -64,9 +64,9 @@ def parse_json_args(): expected_files = parsed['EXPECTED_FILES'] cmake_vars = {} for key, value in parsed['CMAKE_VARS'].items(): - if value in ("ON", "1", "YES"): + if value in ("ON", "1", "YES", "TRUE"): value = True - elif value in ("OFF", "0", "NO"): + elif value in ("OFF", "0", "NO", "FALSE"): value = False cmake_vars[key] = value return expected_files, cmake_vars diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index f1a84798f2af17c254febf5e4c327964db824d91..27773a7e45d059ce322cb206ab3d92fae1e340df 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -12,6 +12,10 @@ from pystencils.backends.cbackend import get_headers from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets from pystencils.stencil import inverse_direction, offset_to_direction_string from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env +from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec +from pystencils.backends.cuda_backend import CudaSympyPrinter +from pystencils.kernelparameters import SHAPE_DTYPE +from pystencils.data_types import TypedSymbol __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel', 'generate_mpidtype_info_from_kernel', 'default_create_kernel_parameters', 'KernelInfo', @@ -50,49 +54,92 @@ def generate_sweep(generation_context, class_name, assignments, """ create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params) - if not generation_context.cuda and create_kernel_params['target'] == 'gpu': + target = create_kernel_params['target'] + if not generation_context.cuda and target == 'gpu': return if isinstance(assignments, KernelFunction): ast = assignments - create_kernel_params['target'] = ast.target + target = ast.target elif not staggered: ast = create_kernel(assignments, **create_kernel_params) else: ast = create_staggered_kernel(assignments, **create_kernel_params) - ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] - ast.nontemporal = create_kernel_params['cpu_vectorize_info']['nontemporal'] - ast.openmp = create_kernel_params['cpu_openmp'] + ast.function_name = class_name.lower() + + selection_tree = KernelCallNode(ast) + generate_selective_sweep(generation_context, class_name, selection_tree, target=target, namespace=namespace, + field_swaps=field_swaps, varying_parameters=varying_parameters, + inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, + cpu_vectorize_info=create_kernel_params['cpu_vectorize_info'], + cpu_openmp=create_kernel_params['cpu_openmp']) + + +def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None, + namespace='pystencils', field_swaps=(), varying_parameters=(), + inner_outer_split=False, ghost_layers_to_include=0, + cpu_vectorize_info=None, cpu_openmp=False): + """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple + pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`. + + Args: + generation_context: see documentation of `generate_sweep` + class_name: name of the generated sweep class + selection_tree: Instance of `AbstractKernelSelectionNode`, root of the selection tree + interface_mappings: sequence of `AbstractInterfaceArgumentMapping` instances for selection arguments of + the selection tree + target: `None`, `'cpu'` or `'gpu'`; inferred from kernels if `None` is given. + namespace: see documentation of `generate_sweep` + field_swaps: see documentation of `generate_sweep` + varying_parameters: see documentation of `generate_sweep` + inner_outer_split: see documentation of `generate_sweep` + ghost_layers_to_include: see documentation of `generate_sweep` + cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels + cpu_openmp: Whether or not CPU kernels use OpenMP parallelization + """ def to_name(f): return f.name if isinstance(f, Field) else f field_swaps = tuple((to_name(e[0]), to_name(e[1])) for e in field_swaps) temporary_fields = tuple(e[1] for e in field_swaps) - ast.function_name = class_name.lower() + kernel_family = KernelFamily(selection_tree, class_name, + temporary_fields, field_swaps, varying_parameters) + + if target is None: + target = kernel_family.get_ast_attr('target') + elif target != kernel_family.get_ast_attr('target'): + raise ValueError('Mismatch between target parameter and AST targets.') + + if not generation_context.cuda and target == 'gpu': + return + + representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} + representative_field = sorted(representative_field)[0] env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) add_pystencils_filters_to_jinja_env(env) - main_kernel_info = KernelInfo(ast, temporary_fields, field_swaps, varying_parameters) - representative_field = {p.field_name for p in main_kernel_info.parameters if p.is_field_parameter} - representative_field = sorted(representative_field)[0] + interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) jinja_context = { - 'kernel': main_kernel_info, + 'kernel': kernel_family, 'namespace': namespace, 'class_name': class_name, - 'target': create_kernel_params.get("target", "cpu"), + 'target': target, 'field': representative_field, - 'headers': get_headers(ast), 'ghost_layers_to_include': ghost_layers_to_include, - 'inner_outer_split': inner_outer_split + 'inner_outer_split': inner_outer_split, + 'interface_spec': interface_spec, + 'generate_functor': True, + 'cpu_vectorize_info': cpu_vectorize_info, + 'cpu_openmp': cpu_openmp } header = env.get_template("Sweep.tmpl.h").render(**jinja_context) source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) - source_extension = "cpp" if create_kernel_params.get("target", "cpu") == "cpu" else "cu" + source_extension = "cpp" if target == "cpu" else "cu" generation_context.write_file("{}.h".format(class_name), header) generation_context.write_file("{}.{}".format(class_name, source_extension), source) @@ -327,11 +374,48 @@ class KernelInfo: self.varying_parameters = tuple(varying_parameters) self.parameters = ast.get_parameters() # cache parameters here - def has_same_interface(self, other): - return self.temporary_fields == other.temporary_fields \ - and self.field_swaps == other.field_swaps \ - and self.varying_parameters == other.varying_parameters \ - and self.parameters == other.parameters + @property + def fields_accessed(self): + return self.ast.fields_accessed + + def get_ast_attr(self, name): + """Returns the value of an attribute of the AST managed by this KernelInfo. + For compatibility with KernelFamily.""" + return self.ast.__getattribute__(name) + + def generate_kernel_invocation_code(self, **kwargs): + ast = self.ast + ast_params = self.parameters + is_cpu = self.ast.target == 'cpu' + call_parameters = ", ".join([p.symbol.name for p in ast_params]) + + if not is_cpu: + stream = kwargs.get('stream', '0') + spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ()) + + if not spatial_shape_symbols: + spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] + spatial_shape_symbols.sort(key=lambda e: e.coordinate) + else: + spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] + + assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\ + "Please only use kernels for generic field sizes!" + + indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) + sp_printer_c = CudaSympyPrinter() + kernel_call_lines = [ + "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['block']), + "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['grid']), + "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, + stream, call_parameters), + ] + + return "\n".join(kernel_call_lines) + else: + return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" def get_vectorize_instruction_set(generation_context): diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index ad6d317878ed6ce650ad81c7223f6d744cba79a5..00fb7279e989838a4fdfc6f2d733596bdcfec79c 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -2,12 +2,9 @@ import jinja2 import sympy as sp # import re -from pystencils import TypedSymbol from pystencils.backends.cbackend import generate_c -from pystencils.backends.cuda_backend import CudaSympyPrinter -from pystencils.data_types import get_base_type +from pystencils.data_types import TypedSymbol, get_base_type from pystencils.field import FieldType -from pystencils.kernelparameters import SHAPE_DTYPE from pystencils.sympyextensions import prod temporary_fieldMemberTemplate = """ @@ -92,6 +89,24 @@ def generate_definition(kernel_info, target='cpu'): return result +def generate_declarations(kernel_family, target='cpu'): + declarations = [] + for ast in kernel_family.all_asts: + code = generate_c(ast, signature_only=True, dialect='cuda' if target == 'gpu' else 'c') + ";" + code = "namespace internal_%s {\n%s\n}\n" % (ast.function_name, code,) + declarations.append(code) + return "\n".join(declarations) + + +def generate_definitions(kernel_family, target='cpu'): + definitions = [] + for ast in kernel_family.all_asts: + code = generate_c(ast, dialect='cuda' if target == 'gpu' else 'c') + code = "namespace internal_%s {\nstatic %s\n}\n" % (ast.function_name, code) + definitions.append(code) + return "\n".join(definitions) + + def field_extraction_code(field, is_temporary, declaration_only=False, no_declaration=False, is_gpu=False, update_member=False): """Returns code string for getting a field pointer. @@ -178,7 +193,7 @@ def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignor @jinja2.contextfilter -def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=None, stream='0', +def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, stream='0', spatial_shape_symbols=()): """Generates the function call to a pystencils kernel @@ -199,16 +214,23 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non may be necessary. """ assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0 - ast = kernel_info.ast - ast_params = kernel_info.parameters - is_cpu = ctx['target'] == 'cpu' + ast_params = kernel.parameters + vec_info = ctx.get('cpu_vectorize_info', None) + instruction_set = kernel.get_ast_attr('instruction_set') + if vec_info: + assume_inner_stride_one = vec_info['assume_inner_stride_one'] + nontemporal = vec_info['nontemporal'] + else: + assume_inner_stride_one = nontemporal = False + cpu_openmp = ctx.get('cpu_openmp', False) + kernel_ghost_layers = kernel.get_ast_attr('ghost_layers') ghost_layers_to_include = sp.sympify(ghost_layers_to_include) - if ast.ghost_layers is None: + if kernel_ghost_layers is None: required_ghost_layers = 0 else: # ghost layer info is ((x_gl_front, x_gl_end), (y_gl_front, y_gl_end).. ) - required_ghost_layers = max(max(ast.ghost_layers)) + required_ghost_layers = max(max(kernel_ghost_layers)) kernel_call_lines = [] @@ -251,15 +273,15 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non coordinates = tuple(coordinates) kernel_call_lines.append(f"{param.symbol.dtype} {param.symbol.name} = {param.field_name}->dataAt" f"({coordinates[0]}, {coordinates[1]}, {coordinates[2]}, {coordinates[3]});") - if ast.assumed_inner_stride_one and field.index_dimensions > 0: + if assume_inner_stride_one and field.index_dimensions > 0: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({param.field_name}->layout(), field::fzyx);") - if ast.instruction_set and ast.assumed_inner_stride_one: - if ast.nontemporal and ast.openmp and 'cachelineZero' in ast.instruction_set: + if instruction_set and assume_inner_stride_one: + if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{ast.instruction_set['cachelineSize']}, 0);") + f"{instruction_set['cachelineSize']}, 0);") else: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{ast.instruction_set['bytes']}, 0);") + f"{instruction_set['bytes']}, 0);") elif param.is_field_stride: casted_stride = get_field_stride(param) type_str = param.symbol.dtype.base_name @@ -273,38 +295,19 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non max_value = f"{field.name}->{('x', 'y', 'z')[coord]}SizeWithGhostLayer()" kernel_call_lines.append(f"WALBERLA_ASSERT_GREATER_EQUAL({max_value}, {shape});") kernel_call_lines.append(f"const {type_str} {param.symbol.name} = {shape};") - if ast.assumed_inner_stride_one and field.index_dimensions > 0: + if assume_inner_stride_one and field.index_dimensions > 0: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL({field.name}->layout(), field::fzyx);") - if ast.instruction_set and ast.assumed_inner_stride_one: - if ast.nontemporal and ast.openmp and 'cachelineZero' in ast.instruction_set: + if instruction_set and assume_inner_stride_one: + if nontemporal and cpu_openmp and 'cachelineZero' in instruction_set: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{ast.instruction_set['cachelineSize']}, 0);") + f"{instruction_set['cachelineSize']}, 0);") else: kernel_call_lines.append(f"WALBERLA_ASSERT_EQUAL((uintptr_t) {field.name}->dataAt(0, 0, 0, 0) %" - f"{ast.instruction_set['bytes']}, 0);") + f"{instruction_set['bytes']}, 0);") - call_parameters = ", ".join([p.symbol.name for p in ast_params]) + kernel_call_lines.append(kernel.generate_kernel_invocation_code(stream=stream, + spatial_shape_symbols=spatial_shape_symbols)) - if not is_cpu: - if not spatial_shape_symbols: - spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] - spatial_shape_symbols.sort(key=lambda e: e.coordinate) - else: - spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] - - assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\ - "Please be only use kernels for generic field sizes!" - - indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) - sp_printer_c = CudaSympyPrinter() - kernel_call_lines += [ - "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']), - "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']), - "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, - stream, call_parameters), - ] - else: - kernel_call_lines.append("internal_%s::%s(%s);" % (ast.function_name, ast.function_name, call_parameters)) return "\n".join(kernel_call_lines) @@ -373,8 +376,7 @@ def generate_constructor_call_arguments(kernel_info, parameters_to_ignore=None): @jinja2.contextfilter def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=False): - ast = kernel_info.ast - fields = {f.name: f for f in ast.fields_accessed} + fields = {f.name: f for f in kernel_info.fields_accessed} params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields) params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters) @@ -424,9 +426,58 @@ def nested_class_method_definition_prefix(ctx, nested_class_name): return outer_class + '::' + nested_class_name +def generate_list_of_expressions(expressions, prepend=''): + if len(expressions) == 0: + return '' + return prepend + ", ".join(expressions) + + +def type_identifier_list(nested_arg_list): + """ + Filters a nested list of strings and TypedSymbols and returns a comma-separated string. + Strings are passed through as they are, but TypedSymbols are formatted as C-style + 'type identifier' strings, e.g. 'uint32_t ghost_layers'. + """ + result = [] + + def recursive_flatten(list): + for s in list: + if isinstance(s, str): + result.append(s) + elif isinstance(s, TypedSymbol): + result.append(f"{s.dtype} {s.name}") + else: + recursive_flatten(s) + + recursive_flatten(nested_arg_list) + return ", ".join(result) + + +def identifier_list(nested_arg_list): + """ + Filters a nested list of strings and TypedSymbols and returns a comma-separated string. + Strings are passed through as they are, but TypedSymbols are replaced by their name. + """ + result = [] + + def recursive_flatten(list): + for s in list: + if isinstance(s, str): + result.append(s) + elif isinstance(s, TypedSymbol): + result.append(s.name) + else: + recursive_flatten(s) + + recursive_flatten(nested_arg_list) + return ", ".join(result) + + def add_pystencils_filters_to_jinja_env(jinja_env): jinja_env.filters['generate_definition'] = generate_definition jinja_env.filters['generate_declaration'] = generate_declaration + jinja_env.filters['generate_definitions'] = generate_definitions + jinja_env.filters['generate_declarations'] = generate_declarations jinja_env.filters['generate_members'] = generate_members jinja_env.filters['generate_constructor_parameters'] = generate_constructor_parameters jinja_env.filters['generate_constructor_initializer_list'] = generate_constructor_initializer_list @@ -437,3 +488,6 @@ def add_pystencils_filters_to_jinja_env(jinja_env): jinja_env.filters['generate_refs_for_kernel_parameters'] = generate_refs_for_kernel_parameters jinja_env.filters['generate_destructor'] = generate_destructor jinja_env.filters['nested_class_method_definition_prefix'] = nested_class_method_definition_prefix + jinja_env.filters['type_identifier_list'] = type_identifier_list + jinja_env.filters['identifier_list'] = identifier_list + jinja_env.filters['list_of_expressions'] = generate_list_of_expressions diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py new file mode 100644 index 0000000000000000000000000000000000000000..0e2becd6f5efdbced366cdaed5b2d4ec536ae3e1 --- /dev/null +++ b/python/pystencils_walberla/kernel_selection.py @@ -0,0 +1,343 @@ +from typing import Sequence, Set +from collections import OrderedDict +from functools import reduce +from jinja2.filters import do_indent +from pystencils import TypedSymbol +from pystencils.backends.cbackend import get_headers +from pystencils.backends.cuda_backend import CudaSympyPrinter +from pystencils.kernelparameters import SHAPE_DTYPE + + +""" + +This module contains several classes and methods supporting the code generation of sweeps +containing multiple kernels. + +A sweep class may be backed by multiple kernel implementations with (nearly) the same interface. +Those might be slightly altered versions of a single kernel, and the one to be executed may +be selected at run time according to certain criteria. An example are the even/odd alternating +kernels required for lattice Boltzmann inplace streaming. Executing multiple kernels in sequence +within a single sweep could also be a use case, which is however not yet implemented. + +## The Kernel Selection Tree + +The selection of the correct kernel is modelled by a tree structure, spanned by instances of +subclasses of `AbstractKernelSelectionNode`. The code generator traverses this tree and generates +code for invoking kernels according to its structure and nodes. +Currently, two types of nodes exist: + +- Condition Nodes: Subclasses of `AbstractConditionNode` manifest as if/else statements + in the generated code. They model decision points and have two subtrees; one for the + `true` case, and one for `false`. A basic implementation is `SimpleBooleanCondition`, + which simply branches according to a boolean value. Each condition node requires a number + of selection parameters. The total of selection parameters of the entire tree are collected + during code generation, and must all be passed to the generated C++ functions. +- Kernel Call Nodes: Currently, `KernelCallNode` corresponds to the invocation of a single + kernel. `KernelCallNode` acts as a wrapper for a single pystencils AST. When encountered + in the tree during code generation, the function call for this AST is inserted. + + +## The Kernel Family + +The `KernelFamily` class is a wrapper around a kernel selection tree. It was developed as a +generalization of the `KernelInfo` class. Its purpose is the collection and management of +information about the tree and its kernels which is required for code generation. +It also checks the tree's ASTs for consistency; for example by making sure that any fields +and symbols required by multiple ASTs have the same type, dimensions, et cetera. + + +## High-Level Interface + +Due to the tree's selection arguments, which must be passed to the methods wrapping the +kernel calls, the generated class can not by itself be used as a sweep functor to be passed +to the waLBerla timeloop. Instead, for all sweep types, a `get[...]Sweep` member function is +generated. It takes any required selection arguments, and returns a lambda function which +can be passed directly to the timeloop. + +Using the interface mapping system, the 'low-level' selection arguments of the kernel tree +can be hidden behind higher-level arguments. Such argument mappings are modelled by the +subclasses of `AbstractInterfaceArgumentMapping`. During code generation, they are organized +in an instance of `HighLevelInterfaceSpec`, which is used to generate the high-level interface. + + +""" + +# ---------------------------------- Selection Tree -------------------------------------------------------------------- + + +class AbstractKernelSelectionNode: + + def collect_kernel_calls(self): + raise NotImplementedError() + + @property + def selection_parameters(self) -> Set[TypedSymbol]: + raise NotImplementedError() + + def collect_selection_parameters(self) -> Set[TypedSymbol]: + return self.selection_parameters + + def get_selection_parameter_list(self) -> Sequence[TypedSymbol]: + all_params = self.collect_selection_parameters() + all_names = set(p.name for p in all_params) + if len(all_names) < len(all_params): + raise ValueError('There existed selection parameters of same name, but different type.') + return sorted(all_params, key=lambda x: x.name) + + def get_code(self, **kwargs) -> str: + raise NotImplementedError() + + +class AbstractConditionNode(AbstractKernelSelectionNode): + def __init__(self, branch_true, branch_false): + self.branch_true = branch_true + self.branch_false = branch_false + + @property + def condition_text(self) -> str: + raise NotImplementedError() + + def collect_kernel_calls(self): + return self.branch_true.collect_kernel_calls() | self.branch_false.collect_kernel_calls() + + def collect_selection_parameters(self) -> Set[TypedSymbol]: + return self.selection_parameters.union(self.branch_true.collect_selection_parameters(), + self.branch_false.collect_selection_parameters()) + + def get_code(self, **kwargs): + true_branch_code = self.branch_true.get_code(**kwargs) + false_branch_code = self.branch_false.get_code(**kwargs) + + true_branch_code = do_indent(true_branch_code, width=4, indentfirst=True) + false_branch_code = do_indent(false_branch_code, width=4, indentfirst=True) + + code = f"if({self.condition_text}) {{\n" + code += true_branch_code + code += "\n} else {\n" + code += false_branch_code + code += "\n}" + return code + + +class KernelCallNode(AbstractKernelSelectionNode): + def __init__(self, ast): + self.ast = ast + self.parameters = ast.get_parameters() # cache parameters here + + @property + def selection_parameters(self) -> Set[TypedSymbol]: + return set() + + def collect_kernel_calls(self): + return {self} + + def get_code(self, **kwargs): + ast = self.ast + ast_params = self.parameters + is_cpu = self.ast.target == 'cpu' + call_parameters = ", ".join([p.symbol.name for p in ast_params]) + + if not is_cpu: + stream = kwargs.get('stream', '0') + spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ()) + + if not spatial_shape_symbols: + spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] + spatial_shape_symbols.sort(key=lambda e: e.coordinate) + else: + spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] + + assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\ + "Please only use kernels for generic field sizes!" + + indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) + sp_printer_c = CudaSympyPrinter() + kernel_call_lines = [ + "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['block']), + "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) + for e in indexing_dict['grid']), + "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, + stream, call_parameters), + ] + + return "\n".join(kernel_call_lines) + else: + return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" + + +class SimpleBooleanCondition(AbstractConditionNode): + def __init__(self, + parameter_name: str, + branch_true: AbstractKernelSelectionNode, + branch_false: AbstractKernelSelectionNode): + self.parameter_symbol = TypedSymbol(parameter_name, bool) + super(SimpleBooleanCondition, self).__init__(branch_true, branch_false) + + @property + def selection_parameters(self) -> Set[TypedSymbol]: + return {self.parameter_symbol} + + @property + def condition_text(self) -> str: + return self.parameter_symbol.name + + +# ---------------------------------- Kernel Family --------------------------------------------------------------------- + + +class KernelFamily: + def __init__(self, kernel_selection_tree: AbstractKernelSelectionNode, + class_name: str, + temporary_fields=(), field_swaps=(), varying_parameters=()): + self.kernel_selection_tree = kernel_selection_tree + self.kernel_selection_parameters = kernel_selection_tree.get_selection_parameter_list() + self.temporary_fields = tuple(temporary_fields) + self.field_swaps = tuple(field_swaps) + self.varying_parameters = tuple(varying_parameters) + + all_kernel_calls = self.kernel_selection_tree.collect_kernel_calls() + all_param_lists = [k.parameters for k in all_kernel_calls] + asts_list = [k.ast for k in all_kernel_calls] + self.representative_ast = asts_list[0] + + # Eliminate duplicates + self.all_asts = set(asts_list) + + # Check function names for uniqueness and reformat them + # using the class name + function_names = [ast.function_name.lower() for ast in self.all_asts] + unique_names = set(function_names) + if len(unique_names) < len(function_names): + raise ValueError('Function names of kernel family members must be unique!') + + prefix = class_name.lower() + for ast in self.all_asts: + ast.function_name = prefix + '_' + ast.function_name + + all_fields = [k.ast.fields_accessed for k in all_kernel_calls] + + # Collect function parameters and accessed fields + self.parameters = merge_lists_of_symbols(all_param_lists) + self.fields_accessed = reduce(lambda x, y: x | y, all_fields) + + self._ast_attrs = dict() + + def get_ast_attr(self, name): + """Returns the value of an attribute of the ASTs managed by this KernelFamily only + if it is the same in all ASTs.""" + if name not in self._ast_attrs: + attr = self.representative_ast.__getattribute__(name) + for ast in self.all_asts: + if ast.__getattribute__(name) != attr: + raise ValueError(f'Inconsistency in kernel family: Attribute {name} was different in {ast}!') + self._ast_attrs[name] = attr + return self._ast_attrs[name] + + def get_headers(self): + all_headers = [get_headers(ast) for ast in self.all_asts] + return reduce(merge_sorted_lists, all_headers) + + def generate_kernel_invocation_code(self, **kwargs): + return self.kernel_selection_tree.get_code(**kwargs) + + +# --------------------------- High-Level Sweep Interface Specification ------------------------------------------------ + + +class AbstractInterfaceArgumentMapping: + def __init__(self, high_level_args: Sequence[TypedSymbol], low_level_arg: TypedSymbol): + self.high_level_args = high_level_args + self.low_level_arg = low_level_arg + + @property + def mapping_code(self): + raise NotImplementedError() + + @property + def headers(self): + return set() + + +class IdentityMapping(AbstractInterfaceArgumentMapping): + + def __init__(self, low_level_arg: TypedSymbol): + self.high_level_args = (low_level_arg,) + self.low_level_arg = low_level_arg + + @property + def mapping_code(self): + return self.low_level_arg.name + + +def _create_interface_mapping_dict(low_level_args: Sequence[TypedSymbol], + mappings: Sequence[AbstractInterfaceArgumentMapping]): + mapping_dict = OrderedDict() + for m in mappings: + larg = m.low_level_arg + if larg not in low_level_args: + raise ValueError(f'Low-level argument {larg} did not exist.') + if larg.name in mapping_dict: + raise ValueError(f'More than one mapping was given for low-level argument {larg}') + mapping_dict[larg.name] = m + + for arg in low_level_args: + mapping_dict.setdefault(arg.name, IdentityMapping(arg)) + + return mapping_dict + + +class HighLevelInterfaceSpec: + def __init__(self, low_level_args: Sequence[TypedSymbol], + mappings: Sequence[AbstractInterfaceArgumentMapping]): + self.low_level_args = low_level_args + mapping_dict = _create_interface_mapping_dict(low_level_args, mappings) + self.mappings = mapping_dict.values() + high_level_args = OrderedDict() + self.mapping_codes = [] + self.headers = set() + for larg in low_level_args: + m = mapping_dict[larg.name] + self.mapping_codes.append(m.mapping_code) + self.headers |= m.headers + for harg in m.high_level_args: + if high_level_args.setdefault(harg.name, harg) != harg: + raise ValueError(f'High-Level Argument {harg} was given multiple times with different types.') + + self.high_level_args = list(high_level_args.values()) + + +# ---------------------------------- Helpers -------------------------------------------------------------------------- + + +def merge_sorted_lists(lx, ly, sort_key=lambda x: x, identity_check_key=None): + if identity_check_key is None: + identity_check_key = sort_key + nx = len(lx) + ny = len(ly) + + def recursive_merge(lx, ly, ix, iy): + if ix == nx: + return ly[iy:] + if iy == ny: + return lx[ix:] + x = lx[ix] + y = ly[iy] + skx = sort_key(x) + sky = sort_key(y) + if skx == sky: + if identity_check_key(x) == identity_check_key(y): + return [x] + recursive_merge(lx, ly, ix + 1, iy + 1) + else: + raise ValueError(f'Elements <{x}> and <{y}> with equal sort key where not identical!') + elif skx < sky: + return [x] + recursive_merge(lx, ly, ix + 1, iy) + else: + return [y] + recursive_merge(lx, ly, ix, iy + 1) + return recursive_merge(lx, ly, 0, 0) + + +def merge_lists_of_symbols(lists): + def merger(lx, ly): + return merge_sorted_lists(lx, ly, sort_key=lambda x: x.symbol.name, identity_check_key=lambda x: x.symbol) + return reduce(merger, lists) diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.cpp b/python/pystencils_walberla/templates/Boundary.tmpl.cpp index ef1ec3b7b860f3ac0fd4b290e1e5fe33bd7f772e..382047df7e2abaad65ac2ae2f45adde353b8266d 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.cpp +++ b/python/pystencils_walberla/templates/Boundary.tmpl.cpp @@ -50,12 +50,7 @@ namespace {{namespace}} { #pragma diag_suppress = declared_but_not_referenced #endif - -{% for sweep_class, sweep_kernel in sweep_classes.items() %} - -{{sweep_kernel|generate_definition}} - -{% endfor %} +{{kernel|generate_definitions(target)}} #ifdef __GNUC__ #pragma GCC diagnostic pop @@ -65,46 +60,52 @@ namespace {{namespace}} { #pragma pop #endif -{% for sweep_class, sweep_kernel in sweep_classes.items() %} - -void {{sweep_class|nested_class_method_definition_prefix}}::run( IBlock * block, IndexVectors::Type type {% if target == 'gpu'%}, cudaStream_t stream {%endif%}) +void {{class_name}}::run_impl( + {{- ["IBlock * block", "IndexVectors::Type type", + kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] + | type_identifier_list -}} +) { - auto * indexVectors = block->getData<IndexVectors>(indexVectorID); - int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); - if( indexVectorSize == 0) - return; - - {% if target == 'gpu' -%} - auto pointer = indexVectors->pointerGpu(type); - {% else %} - auto pointer = indexVectors->pointerCpu(type); - {% endif %} - - uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); - - {{sweep_kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}} - {{sweep_kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }} - {{sweep_kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}} + auto * indexVectors = block->getData<IndexVectors>(indexVectorID); + int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() ); + if( indexVectorSize == 0) + return; + + {% if target == 'gpu' -%} + auto pointer = indexVectors->pointerGpu(type); + {% else %} + auto pointer = indexVectors->pointerCpu(type); + {% endif %} + + uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer); + + {{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}} + {{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }} + {{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}} } -void {{sweep_class|nested_class_method_definition_prefix}}::operator() ( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} ) +void {{class_name}}::run( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} +) { - run( block, IndexVectors::ALL{% if target == 'gpu'%}, stream {%endif%}); + run_impl( {{- ["block", "IndexVectors::ALL", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } -void {{sweep_class|nested_class_method_definition_prefix}}::inner( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} ) +void {{class_name}}::inner( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} +) { - run( block, IndexVectors::INNER{% if target == 'gpu'%}, stream {%endif%} ); + run_impl( {{- ["block", "IndexVectors::INNER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } -void {{sweep_class|nested_class_method_definition_prefix}}::outer( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} ) +void {{class_name}}::outer( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} +) { - run( block, IndexVectors::OUTER{% if target == 'gpu'%}, stream {%endif%} ); + run_impl( {{- ["block", "IndexVectors::OUTER", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } -{% endfor %} - } // namespace {{namespace}} } // namespace walberla diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h index c0a39c32b0ae8c08202ae7f0641f03db0ad4e592..e22d96a796ac9e1420a4d69b5b9122081a655dbf 100644 --- a/python/pystencils_walberla/templates/Boundary.tmpl.h +++ b/python/pystencils_walberla/templates/Boundary.tmpl.h @@ -34,6 +34,10 @@ #include <set> #include <vector> +{% for header in interface_spec.headers %} +#include {{header}} +{% endfor %} + #ifdef __GNUC__ #define RESTRICT __restrict__ #elif _MSC_VER @@ -108,46 +112,55 @@ public: {%- endif %} }; - {% if multi_sweep %} - {% for sweep_class_name, sweep_kernel_info in sweep_classes.items() %} - class {{sweep_class_name}} + {{class_name}}( const shared_ptr<StructuredBlockForest> & blocks, + {{kernel|generate_constructor_parameters(['indexVector', 'indexVectorSize'])}}{{additional_data_handler.constructor_arguments}}) + :{{additional_data_handler.initialiser_list}} {{ kernel|generate_constructor_initializer_list(['indexVector', 'indexVectorSize']) }} { - public: - {{sweep_class_name}} ( {{sweep_kernel_info|generate_constructor_parameters(['indexVectorSize'])}} ) - : {{ sweep_kernel_info|generate_constructor_initializer_list(['indexVectorSize']) }} {}; - - void operator() ( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - void inner( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - void outer( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - - private: - void run( IBlock * block, IndexVectors::Type type{% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - - {{sweep_kernel_info|generate_members(['indexVectorSize'])|indent(12)}} + auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; + indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_{{class_name}}"); }; - {{sweep_class_name}} get{{sweep_class_name}} () + void run ( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + ); + + {% if generate_functor -%} + void operator() ( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + ) { - return {{sweep_class_name}} ( {{sweep_kernel_info|generate_constructor_call_arguments(['indexVectorSize'])|indent(12)}} ); + run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); } - {% endfor %} - {% endif %} + {%- endif %} - {{class_name}}( const shared_ptr<StructuredBlockForest> & blocks, - {{dummy_kernel_info|generate_constructor_parameters(['indexVector', 'indexVectorSize'])}}{{additional_data_handler.constructor_arguments}}) - :{{additional_data_handler.initialiser_list}} {{ dummy_kernel_info|generate_constructor_initializer_list(['indexVector', 'indexVectorSize']) }} - { - auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); }; - indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_{{class_name}}"); - }; + void inner ( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + ); + + void outer ( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + ); - {% if not multi_sweep %} + std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + { + return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] + (IBlock * b) + { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; + } - void operator() ( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - void inner( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - void outer( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); + std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + { + return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] + (IBlock * b) + { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; + } - {% endif %} + std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + { + return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] + (IBlock * b) + { this->outer( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; + } template<typename FlagField_T> void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID, @@ -208,14 +221,16 @@ public: } private: - {% if not multi_sweep %} - void run( IBlock * block, IndexVectors::Type type{% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%}); - {% endif %} + void run_impl( + {{- ["IBlock * block", "IndexVectors::Type type", + kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + | type_identifier_list -}} + ); BlockDataID indexVectorID; {{additional_data_handler.additional_member_variable|indent(4)}} public: - {{dummy_kernel_info|generate_members(('indexVector', 'indexVectorSize'))|indent(4)}} + {{kernel|generate_members(('indexVector', 'indexVectorSize'))|indent(4)}} }; diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp index 9711be8cde6d7a94475c8172cb858c03631f60d4..d70096e97faa2a9ff0da26fbb996b5f5b5764ac1 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp +++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp @@ -23,7 +23,8 @@ #include "core/DataTypes.h" #include "core/Macros.h" #include "{{class_name}}.h" -{% for header in headers %} + +{% for header in kernel.get_headers() %} #include {{header}} {% endfor %} @@ -53,9 +54,9 @@ namespace walberla { namespace {{namespace}} { -{{kernel|generate_definition(target)}} +{{kernel|generate_definitions(target)}} -void {{class_name}}::operator()( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} ) +void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} {{kernel|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True)|indent(4) }} @@ -64,10 +65,11 @@ void {{class_name}}::operator()( IBlock * block{%if target is equalto 'gpu'%} , } -void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks, - const CellInterval & globalCellInterval, - cell_idx_t ghostLayers, - IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} ) +void {{class_name}}::runOnCellInterval( + {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block", + kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] + | type_identifier_list -}} +) { CellInterval ci = globalCellInterval; CellInterval blockBB = blocks->getBlockCellBB( *block); @@ -84,7 +86,7 @@ void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage> } {%if inner_outer_split%} -void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} ) +void {{class_name}}::inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} @@ -96,7 +98,7 @@ void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaS } -void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} ) +void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h index 9f0711a1e807553b2ca9cea2fa5dc5790c9c91af..093e2332ac4e5925305701a4f0d54887f18dffe1 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.h +++ b/python/pystencils_walberla/templates/Sweep.tmpl.h @@ -34,6 +34,10 @@ #include "domain_decomposition/StructuredBlockStorage.h" #include <set> +{% for header in interface_spec.headers %} +#include {{header}} +{% endfor %} + #ifdef __GNUC__ #define RESTRICT __restrict__ #elif _MSC_VER @@ -61,52 +65,100 @@ public: {{ kernel| generate_destructor(class_name) |indent(4) }} - void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} ); - void runOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks, - const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block - {%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %}); + void run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); + + void runOnCellInterval( + {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", "cell_idx_t ghostLayers", "IBlock * block", + kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + | type_identifier_list -}} + ); + + {% if generate_functor %} + void operator() ( + {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} + ) + { + run( {{- ["block", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); + } + {% endif %} + static std::function<void (IBlock *)> getSweep( + {{- ["const shared_ptr<" + class_name + "> & kernel", kernel.kernel_selection_parameters] | type_identifier_list -}} + ) + { + return [ {{- [ ['kernel'], kernel.kernel_selection_parameters ] | identifier_list -}} ] + (IBlock * b) + { kernel->run( {{- [ ['b'], kernel.kernel_selection_parameters] | identifier_list -}} ); }; + } + static std::function<void (IBlock* {%- if target is equalto 'gpu' %}, cudaStream_t {% endif -%} )> getSweepOnCellInterval( + {{- ["const shared_ptr<" + class_name + "> & kernel", "const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", + kernel.kernel_selection_parameters, 'cell_idx_t ghostLayers=1'] + | type_identifier_list -}} + ) + { + return [ {{- ["kernel", "blocks", "globalCellInterval", "ghostLayers", kernel.kernel_selection_parameters] | identifier_list -}} ] + (IBlock * b{%- if target is equalto 'gpu'%}, cudaStream_t stream = nullptr{% endif -%}) + { kernel->runOnCellInterval( + {{- ["blocks", "globalCellInterval", "ghostLayers", "b", kernel.kernel_selection_parameters, ["stream"] if target == 'gpu' else []] + | identifier_list + -}} + ); }; + } - static std::function<void (IBlock*)> getSweep(const shared_ptr<{{class_name}}> & kernel) { - return [kernel](IBlock * b) { (*kernel)(b); }; + std::function<void (IBlock *)> getSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + { + return [ {{- ["this", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] + (IBlock * b) + { this->run( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; } - static std::function<void (IBlock*{%if target is equalto 'gpu'%} , cudaStream_t {% endif %})> - getSweepOnCellInterval(const shared_ptr<{{class_name}}> & kernel, - const shared_ptr<StructuredBlockStorage> & blocks, - const CellInterval & globalCellInterval, - cell_idx_t ghostLayers=1 ) + std::function<void (IBlock *)> getSweepOnCellInterval( + {{- ["const shared_ptr<StructuredBlockStorage> & blocks", "const CellInterval & globalCellInterval", + interface_spec.high_level_args, 'cell_idx_t ghostLayers=1', ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] + | type_identifier_list -}} + ) { - return [kernel, blocks, globalCellInterval, ghostLayers] (IBlock * b{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %}) { - kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b{%if target is equalto 'gpu'%}, stream {% endif %}); - }; + return [ {{- ["this", "blocks", "globalCellInterval", "ghostLayers", interface_spec.high_level_args, ["stream"] if target == 'gpu' else []] | identifier_list -}} ] + (IBlock * b) + { this->runOnCellInterval( {{- ["blocks", "globalCellInterval", "ghostLayers", "b", interface_spec.mapping_codes, ["stream"] if target == 'gpu' else []] | identifier_list -}} ); }; } {% if inner_outer_split %} + void inner( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} ); - void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = nullptr{% endif %} ); + void outer( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ); - void setOuterPriority(int priority ) { - {%if target is equalto 'gpu'%} - parallelStreams_.setStreamPriority(priority); - {%endif%} + std::function<void (IBlock *)> getInnerSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + { + return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] + (IBlock * b) + { this->inner( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; + } + + std::function<void (IBlock *)> getOuterSweep( {{- [interface_spec.high_level_args, ["cudaStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} ) + { + return [ {{- [ ['this'], interface_spec.high_level_args, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ] + (IBlock * b) + { this->outer( {{- [ ['b'], interface_spec.mapping_codes, ["stream"] if target == 'gpu' else [] ] | identifier_list -}} ); }; + } + + void setOuterPriority(int priority) + { + {%if target is equalto 'gpu'%} + parallelStreams_.setStreamPriority(priority); + {%endif%} } {{kernel|generate_members|indent(4)}} private: - {%if target is equalto 'gpu'%} - cuda::ParallelStreams parallelStreams_; - {% endif %} + {%if target is equalto 'gpu' -%} cuda::ParallelStreams parallelStreams_; {%- endif %} Cell outerWidth_; std::vector<CellInterval> layers_; - -{%- else -%} +{% else %} {{ kernel|generate_members|indent(4) }} -{% endif -%} - +{% endif %} }; diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdf6e9923fe4fb106ab2975109a7c31a7f0d633 --- /dev/null +++ b/python/pystencils_walberla/utility.py @@ -0,0 +1,91 @@ +from os import path +from pystencils.data_types import get_base_type +from pystencils_walberla.cmake_integration import CodeGenerationContext + +HEADER_EXTENSIONS = {'.h', '.hpp'} + + +def generate_info_header(ctx: CodeGenerationContext, + filename: str, + stencil_typedefs: dict = None, + field_typedefs: dict = None, + additional_headers: set = None, + headers_to_ignore: set = None, + additional_typedefs: dict = None): + """Generates an info header, consolidating required information about the generated code. + The info header #includes all generated header files, and is thus the only header the + application needs to #include. It can also contain aliases for waLBerla stencil types and + instances of the GhostLayerField template. + + Args: + ctx: Code Generation Context + filename: Name of the generated info header file + stencil_typedefs: dict mapping type names to stencil names or tuples + field_typedefs: dict mapping type names to pystencils `Field` instances + additional_headers: additional header files to be included + headers_to_ignore: headers which should not be included + additional_typedefs: dict mapping aliases to types. + """ + stencil_typedefs = stencil_typedefs if stencil_typedefs is not None else dict() + field_typedefs = field_typedefs if field_typedefs is not None else dict() + additional_typedefs = additional_typedefs if additional_typedefs is not None else dict() + + additional_headers = additional_headers if additional_headers is not None else set() + headers_to_ignore = headers_to_ignore if headers_to_ignore is not None else set() + + headers_in_ctx = set(_filter_headers(ctx.files_written)) + + stencil_headers, stencil_typedefs = _stencil_inclusion_code(stencil_typedefs) + field_headers, field_typedefs = _field_inclusion_code(field_typedefs) + + headers_to_include = stencil_headers | field_headers | headers_in_ctx | additional_headers + headers_to_include = sorted(headers_to_include - headers_to_ignore) + headers_to_include = [f'#include "{header}"' for header in headers_to_include] + + typedefs = {**stencil_typedefs, **field_typedefs, **additional_typedefs} + typedefs = [f"using {alias} = {typename};" for alias, typename in typedefs.items()] + + lines = '\n'.join(headers_to_include + [''] + typedefs) + '\n' + + if path.splitext(filename)[1] not in HEADER_EXTENSIONS: + filename += '.h' + + ctx.write_file(filename, lines) + + +# ------------------------------------- INTERNAL ------------------------------------------------------------- + + +def _filter_headers(filepaths): + for p in filepaths: + if path.splitext(p)[1] in HEADER_EXTENSIONS: + yield path.split(p)[1] + + +def _stencil_inclusion_code(stencil_typedefs): + headers = set() + typedefs = dict() + for typename, stencil in stencil_typedefs.items(): + if isinstance(stencil, tuple): + dim = len(stencil[0]) + q = len(stencil) + stencil = f"D{dim}Q{q}" + elif not isinstance(stencil, str): + raise ValueError(f'Invalid stencil: Do not know what to make of {stencil}') + + headers.add(f"stencil/{stencil}.h") + typedefs[typename] = f"walberla::stencil::{stencil}" + + return headers, typedefs + + +def _field_inclusion_code(field_typedefs): + headers = set() + typedefs = dict() + for typename, field in field_typedefs.items(): + f_size = field.values_per_cell() + dtype = get_base_type(field.dtype) + headers.add("field/GhostLayerField.h") + typedefs[typename] = f"walberla::field::GhostLayerField<{dtype}, {f_size}>" + + return headers, typedefs diff --git a/src/lbm/inplace_streaming/TimestepTracker.h b/src/lbm/inplace_streaming/TimestepTracker.h new file mode 100644 index 0000000000000000000000000000000000000000..c0ca22635fea5d9a454f99e2d4fe08a519578eef --- /dev/null +++ b/src/lbm/inplace_streaming/TimestepTracker.h @@ -0,0 +1,50 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \\file TimestepTracker.h +//! \\ingroup lbm +//! \\author Frederik Hennig <frederik.hennig@fau.de> +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" + +namespace walberla +{ +namespace lbm +{ +class TimestepTracker +{ + private: + uint8_t counter_; + + public: + TimestepTracker() : counter_(0) {} + TimestepTracker(uint8_t zeroth_timestep) : counter_(zeroth_timestep & 1) {} + + void advance() { counter_ = (counter_ + 1) & 1; } + + std::function< void() > getAdvancementFunction() + { + return [this]() { this->advance(); }; + } + + uint8_t getCounter() const { return counter_; } + +}; // class TimestepTracker + +} // namespace lbm +} // namespace walberla diff --git a/tests/lbm/CMakeLists.txt b/tests/lbm/CMakeLists.txt index b67913654b775abb7fe5b9d246bf6be5bb52d7d3..2ce27af42009821fcf9e920d4582ab521104c0f8 100644 --- a/tests/lbm/CMakeLists.txt +++ b/tests/lbm/CMakeLists.txt @@ -124,4 +124,25 @@ waLBerla_generate_target_from_python(NAME LbmPackInfoGenerationTestCodegen FILE waLBerla_link_files_to_builddir( "diff_packinfos.sh" ) waLBerla_execute_test( NAME LbmPackInfoGenerationDiffTest COMMAND bash diff_packinfos.sh ) + +waLBerla_generate_target_from_python( NAME InplaceStreamingCodegen + FILE codegen/InplaceStreamingCodegen.py + OUT_FILES PullSweep.h PullSweep.cpp + PullNoSlip.h PullNoSlip.cpp + PullUBB.h PullUBB.cpp + PullOutflow.h PullOutflow.cpp + PullInit.h PullInit.cpp + InPlaceSweep.h InPlaceSweep.cpp + InPlaceNoSlip.h InPlaceNoSlip.cpp + InPlaceUBB.h InPlaceUBB.cpp + InPlaceOutflow.h InPlaceOutflow.cpp + InPlaceInit.h InPlaceInit.cpp + InplaceStreamingCodegen.h) + +waLBerla_compile_test( FILES codegen/InplaceStreamingCodegenTest.cpp + DEPENDS InplaceStreamingCodegen ) + +waLBerla_execute_test( NAME InplaceStreamingCodegenTest + COMMAND $<TARGET_FILE:InplaceStreamingCodegenTest> ${CMAKE_CURRENT_SOURCE_DIR}/codegen/InplaceStreamingCodegen3D.prm ) + endif() diff --git a/tests/lbm/codegen/InplaceStreamingCodegen.py b/tests/lbm/codegen/InplaceStreamingCodegen.py new file mode 100644 index 0000000000000000000000000000000000000000..7fcef0f4215805cfa40f68d6fe82f6d3cfbba1c6 --- /dev/null +++ b/tests/lbm/codegen/InplaceStreamingCodegen.py @@ -0,0 +1,92 @@ +from lbmpy_walberla import generate_alternating_lbm_sweep, generate_boundary, generate_alternating_lbm_boundary +from lbmpy_walberla.additional_data_handler import OutflowAdditionalDataHandler +from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header + +from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_ast +from lbmpy.macroscopic_value_kernels import macroscopic_values_setter +from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow +from lbmpy.advanced_streaming import Timestep + +from pystencils import Field +from lbmpy.stencils import get_stencil + +# Common Setup + +stencil = get_stencil('D3Q27') +dim = len(stencil[0]) +q = len(stencil) +target = 'cpu' +inplace_pattern = 'aa' +two_fields_pattern = 'pull' +namespace = 'lbmpy' + +f_field = Field.create_generic('f', dim, index_shape=(q,), layout='fzyx') +f_field_tmp = Field.create_generic('f_tmp', dim, index_shape=(q,), layout='fzyx') +u_field = Field.create_generic('u', dim, index_shape=(dim,), layout='fzyx') + +output = { + 'velocity': u_field +} + +method_params = { + 'stencil': stencil, + 'method': 'srt', + 'relaxation_rate': 1.5, + 'output': output +} + +optimization = { + 'target': target, + 'symbolic_field': f_field, + 'symbolic_temporary_field': f_field_tmp +} + +collision_rule = create_lb_collision_rule(**method_params) +lb_method = collision_rule.method +noslip = NoSlip() +ubb = UBB((0.05,) + (0,) * (dim - 1)) + +outflow_normal = (1,) + (0,) * (dim - 1) +outflow_pull = ExtrapolationOutflow(outflow_normal, lb_method, streaming_pattern=two_fields_pattern) + +outflow_inplace = ExtrapolationOutflow(outflow_normal, lb_method, streaming_pattern=inplace_pattern) + +init_velocity = (0, ) * dim + +init_kernel_pull = macroscopic_values_setter(lb_method, 1, init_velocity, f_field, streaming_pattern=two_fields_pattern) +init_kernel_inplace = macroscopic_values_setter( + lb_method, 1, init_velocity, f_field, streaming_pattern=inplace_pattern, previous_timestep=Timestep.ODD) + +stencil_typedefs = {'Stencil_T': stencil} +field_typedefs = {'PdfField_T': f_field, 'VelocityField_T': u_field} + +with CodeGeneration() as ctx: + # Pull-Pattern classes + ast_pull = create_lb_ast(collision_rule=collision_rule, + streaming_pattern=two_fields_pattern, optimization=optimization) + generate_sweep(ctx, 'PullSweep', ast_pull, field_swaps=[(f_field, f_field_tmp)], namespace=namespace) + + generate_boundary(ctx, 'PullNoSlip', noslip, lb_method, + streaming_pattern=two_fields_pattern, target=target, namespace=namespace) + generate_boundary(ctx, 'PullUBB', ubb, lb_method, streaming_pattern=two_fields_pattern, + target=target, namespace=namespace) + generate_boundary(ctx, 'PullOutflow', outflow_pull, lb_method, + streaming_pattern=two_fields_pattern, target=target, namespace=namespace) + + generate_sweep(ctx, 'PullInit', init_kernel_pull, target=target, namespace=namespace) + + # Inplace Pattern classes + generate_alternating_lbm_sweep(ctx, 'InPlaceSweep', collision_rule, inplace_pattern, + optimization=optimization, namespace=namespace) + + generate_alternating_lbm_boundary(ctx, 'InPlaceNoSlip', noslip, lb_method, streaming_pattern=inplace_pattern, + after_collision=True, target=target, namespace=namespace) + generate_alternating_lbm_boundary(ctx, 'InPlaceUBB', ubb, lb_method, streaming_pattern=inplace_pattern, + after_collision=True, target=target, namespace=namespace) + generate_alternating_lbm_boundary(ctx, 'InPlaceOutflow', outflow_inplace, lb_method, streaming_pattern=inplace_pattern, + after_collision=True, target=target, namespace=namespace) + + generate_sweep(ctx, 'InPlaceInit', init_kernel_inplace, target=target, namespace=namespace) + + generate_info_header(ctx, "InplaceStreamingCodegen.h", + stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs) diff --git a/tests/lbm/codegen/InplaceStreamingCodegen2D.prm b/tests/lbm/codegen/InplaceStreamingCodegen2D.prm new file mode 100644 index 0000000000000000000000000000000000000000..08b9584d69ea211bc4cbbac0c85b70980843ade9 --- /dev/null +++ b/tests/lbm/codegen/InplaceStreamingCodegen2D.prm @@ -0,0 +1,33 @@ + +Parameters +{ + timesteps 50; + vtkWriteFrequency 0; +} + +DomainSetup +{ + blocks < 1, 1, 1 >; + cellsPerBlock < 30, 30, 1 >; + periodic < 0, 0, 0 >; +} + +Boundaries +{ + Border { direction W; + walldistance -1; + flag UBB; + ghostLayersToInitialize 0; } + Border { direction E; + walldistance -1; + flag Outflow; + ghostLayersToInitialize 0; } + Border { direction N; + walldistance -1; + flag NoSlip; + ghostLayersToInitialize 1; } + Border { direction S; + walldistance -1; + flag NoSlip; + ghostLayersToInitialize 1; } +} diff --git a/tests/lbm/codegen/InplaceStreamingCodegen3D.prm b/tests/lbm/codegen/InplaceStreamingCodegen3D.prm new file mode 100644 index 0000000000000000000000000000000000000000..0fa8f50f1ee88c521401e345c8f88ca7dd645a5c --- /dev/null +++ b/tests/lbm/codegen/InplaceStreamingCodegen3D.prm @@ -0,0 +1,45 @@ + +Parameters +{ + timesteps 70; + vtkWriteFrequency 0; +} + +DomainSetup +{ + blocks < 1, 1, 1 >; + cellsPerBlock < 30, 30, 30 >; + periodic < 0, 0, 0 >; +} + +Boundaries +{ + Border { direction W; + walldistance -1; + flag UBB; } + + Border { direction E; + walldistance -1; + flag Outflow; + ghostLayersToInitialize 0; } + + Border { direction N; + walldistance -1; + flag NoSlip; + ghostLayersToInitialize 1; } + + Border { direction S; + walldistance -1; + flag NoSlip; + ghostLayersToInitialize 1; } + + Border { direction T; + walldistance -1; + flag NoSlip; + ghostLayersToInitialize 1; } + + Border { direction B; + walldistance -1; + flag NoSlip; + ghostLayersToInitialize 1; } +} diff --git a/tests/lbm/codegen/InplaceStreamingCodegenTest.cpp b/tests/lbm/codegen/InplaceStreamingCodegenTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..718042efa2b1edb911052328a28cdb210ab65de9 --- /dev/null +++ b/tests/lbm/codegen/InplaceStreamingCodegenTest.cpp @@ -0,0 +1,163 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file InplaceStreamingCodegenTest.cpp +//! \author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== + +#include "InplaceStreamingCodegen.h" + +#include "blockforest/all.h" + +#include "core/Environment.h" + +#include "field/all.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "lbm/inplace_streaming/TimestepTracker.h" + +#include "timeloop/SweepTimeloop.h" + +#include <string> + +using namespace walberla::lbmpy; + +namespace walberla +{ +typedef uint8_t boundaryFlag_t; +typedef FlagField< boundaryFlag_t > FlagField_T; + +int main(int argc, char** argv) +{ + Environment walberlaEnv(argc, argv); + auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config()); + + auto params = walberlaEnv.config()->getBlock("Parameters"); + uint_t timesteps = params.getParameter< uint_t >("timesteps"); + uint_t vtkWriteFrequency = params.getParameter< uint_t >("vtkWriteFrequency"); + + BlockDataID pullPdfFieldID = field::addToStorage< PdfField_T >(blocks, "f_pull", real_c(0.0), field::fzyx); + BlockDataID inplacePdfFieldID = field::addToStorage< PdfField_T >(blocks, "f_inplace", real_c(0.0), field::fzyx); + + BlockDataID pullVelocityFieldID = field::addToStorage< VelocityField_T >(blocks, "u_pull", real_c(0.0), field::fzyx); + BlockDataID inplaceVelocityFieldID = + field::addToStorage< VelocityField_T >(blocks, "u_inplace", real_c(0.0), field::fzyx); + + BlockDataID boundaryFlagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "boundaryFlagField", 1); + + // Set up boundary handling + + const FlagUID fluidFlagUID("Fluid"); + const FlagUID noslipFlagUID("NoSlip"); + const FlagUID ubbFlagUID("UBB"); + const FlagUID outflowFlagUID("Outflow"); + + auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries"); + + geometry::initBoundaryHandling< FlagField_T >(*blocks, boundaryFlagFieldID, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, boundaryFlagFieldID, fluidFlagUID); + + PullNoSlip pullNoSlip(blocks, pullPdfFieldID); + pullNoSlip.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, noslipFlagUID, fluidFlagUID); + + PullUBB pullUBB(blocks, pullPdfFieldID); + pullUBB.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, ubbFlagUID, fluidFlagUID); + + PullOutflow pullOutflow(blocks, pullPdfFieldID); + pullOutflow.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, outflowFlagUID, fluidFlagUID); + + InPlaceNoSlip inplaceNoSlip(blocks, inplacePdfFieldID); + inplaceNoSlip.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, noslipFlagUID, fluidFlagUID); + + InPlaceUBB inplaceUBB(blocks, inplacePdfFieldID); + inplaceUBB.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, ubbFlagUID, fluidFlagUID); + + InPlaceOutflow inplaceOutflow(blocks, inplacePdfFieldID); + inplaceOutflow.fillFromFlagField< FlagField_T >(blocks, boundaryFlagFieldID, outflowFlagUID, fluidFlagUID); + + // Init fields + + PullInit pullInit(pullPdfFieldID); + InPlaceInit inplaceInit(inplacePdfFieldID); + + for (auto b = blocks->begin(); b != blocks->end(); b++) + { + pullInit(&(*b)); + inplaceInit(&(*b)); + } + + // Set up sweeps + + PullSweep pullLbmSweep(pullPdfFieldID, pullVelocityFieldID); + InPlaceSweep inplaceLbmSweep(inplacePdfFieldID, inplaceVelocityFieldID); + + auto tracker = make_shared< lbm::TimestepTracker >(1); + + // Pull Timeloop + + SweepTimeloop pullTimeloop(blocks, timesteps); + pullTimeloop.add() << Sweep(pullUBB); + pullTimeloop.add() << Sweep(pullNoSlip); + pullTimeloop.add() << Sweep(pullOutflow); + pullTimeloop.add() << Sweep(pullLbmSweep); + + if (vtkWriteFrequency > 0) + { + pullTimeloop.addFuncAfterTimeStep(field::createVTKOutput< VelocityField_T, float >( + pullVelocityFieldID, *blocks, "pull velocity", vtkWriteFrequency, uint_c(0), false, "pull_vtk")); + } + + pullTimeloop.run(); + + // In-Place Timeloop + + SweepTimeloop inplaceTimeloop(blocks, timesteps); + inplaceTimeloop.add() << Sweep(inplaceUBB.getSweep(tracker)); + inplaceTimeloop.add() << Sweep(inplaceNoSlip.getSweep(tracker)); + inplaceTimeloop.add() << Sweep(inplaceOutflow.getSweep(tracker)) << AfterFunction(tracker->getAdvancementFunction()); + inplaceTimeloop.add() << Sweep(inplaceLbmSweep.getSweep(tracker)); + + if (vtkWriteFrequency > 0) + { + inplaceTimeloop.addFuncAfterTimeStep(field::createVTKOutput< VelocityField_T, float >( + inplaceVelocityFieldID, *blocks, "inplace velocity", vtkWriteFrequency, uint_c(0), false, "inplace_vtk")); + } + + inplaceTimeloop.run(); + + // Validate results + + for (auto b = blocks->begin(); b != blocks->end(); b++) + { + auto pullVelocityField = b->getData< VelocityField_T >(pullVelocityFieldID); + auto inplaceVelocityField = b->getData< VelocityField_T >(inplaceVelocityFieldID); + + CellInterval fieldSize = pullVelocityField->xyzSize(); + for (Cell c : fieldSize) + { + for (uint_t i = 0; i < pullVelocityField->fSize(); i++) + { + WALBERLA_CHECK_FLOAT_EQUAL(pullVelocityField->get(c, i), inplaceVelocityField->get(c, i)); + } + } + } + + return 0; +} +} // namespace walberla + +int main(int argc, char** argv) { return walberla::main(argc, argv); }