Compare revisions
Showing 2377 additions and 67 deletions
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_generate_target_from_python(NAME TaylorGreenVortexGenerated
FILE TaylorGreenVortex.py
OUT_FILES TaylorGreenVortexStorageSpecification.h TaylorGreenVortexStorageSpecification.cpp
TaylorGreenVortexSweepCollection.h TaylorGreenVortexSweepCollection.cpp
NoSlip.h NoSlip.cpp
UBB.h UBB.cpp
TaylorGreenVortexBoundaryCollection.h
TaylorGreenVortexHeader.h)
waLBerla_add_executable ( NAME TaylorGreenVortex
FILES TaylorGreenVortex.cpp
DEPENDS TaylorGreenVortexGenerated blockforest core field geometry lbm_generated timeloop )
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file TaylorGreenVortex.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//! \brief TaylorGreenVortex
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/MemoryUsage.h"
#include "core/SharedFunctor.h"
#include "core/debug/TestSubsystem.h"
#include "core/logging/Initialization.h"
#include "core/math/Vector3.h"
#include "core/timing/RemainingTimeLogger.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/GhostLayerField.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "timeloop/SweepTimeloop.h"
#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/field/PdfField.h"
// include the generated header file. It includes all generated classes
#include "TaylorGreenVortexHeader.h"
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wignored-qualifiers"
#endif
using namespace walberla;
using namespace std::placeholders;
using StorageSpecification_T = lbm::TaylorGreenVortexStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
using PackInfo_T = lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >;
using SweepCollection_T = lbm::TaylorGreenVortexSweepCollection;
using VectorField_T = GhostLayerField< real_t, StorageSpecification_T::Stencil::D >;
using ScalarField_T = GhostLayerField< real_t, 1 >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
using BoundaryCollection_T = lbm::TaylorGreenVortexBoundaryCollection< FlagField_T >;
using blockforest::communication::UniformBufferedScheme;
struct IDs
{
BlockDataID pdfField;
BlockDataID velocityField;
BlockDataID densityField;
BlockDataID avgVelField;
BlockDataID avgVelSqrField;
BlockDataID avgPressureField;
BlockDataID flagField;
BlockDataID pdfFieldGPU;
BlockDataID velocityFieldGPU;
BlockDataID densityFieldGPU;
};
struct Setup
{
Vector3< uint_t > blocks;
Vector3< uint_t > cellsPerBlock;
Vector3< uint_t > cells;
Vector3< bool > periodic;
uint_t numGhostLayers;
};
class AccuracyEvaluation
{
public:
AccuracyEvaluation(std::shared_ptr< StructuredBlockForest >& blocks, const IDs& ids,
SweepCollection_T& sweepCollection, const real_t maxLatticeVelocity, const uint_t logFrequency,
const bool logToStream, const bool logToFile)
: blocks_(blocks), ids_(ids), sweepCollection_(sweepCollection), maxLatticeVelocity_(maxLatticeVelocity),
executionCounter_(uint_c(0)), filename_("TaylorGreenVortex.txt"), normalizationFactor_(real_c(1.0))
{
logFrequency_ = logToStream ? logFrequency : uint_c(0);
plotFrequency_ = logToFile ? logFrequency : uint_c(0);
}
void setNormalizationFactor(const real_t f) { normalizationFactor_ = f; }
void setFilename(const std::string& filename) { filename_ = filename; }
real_t L1() const { return L1_; }
real_t L2() const { return L2_; }
real_t Lmax() const { return Lmax_; }
void operator()()
{
if (logFrequency_ == uint_c(0) && (plotFrequency_ == uint_c(0) || filename_.empty())) return;
++executionCounter_;
const bool plot = (plotFrequency_ != uint_c(0) && (executionCounter_ - uint_c(1)) % plotFrequency_ == uint_c(0) &&
!filename_.empty());
const bool log = (logFrequency_ != uint_c(0) && (executionCounter_ - uint_c(1)) % logFrequency_ == uint_c(0));
if (!log && !plot) return;
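// Volume-weighted error norms against the analytic profile, accumulated
// cell-by-cell below and reduced over all processes afterwards:
//   L1   = sum( |u - u_exact|   * dV / V )
//   L2   = sqrt( sum( |u - u_exact|^2 * dV / V ) )
//   Lmax = max( |u - u_exact| )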
const auto& domainAABB = blocks_->getDomain();
real_t _L1(real_c(0.0));
real_t _L2(real_c(0.0));
real_t _Lmax(real_c(0.0));
for (auto block = blocks_->begin(); block != blocks_->end(); ++block)
{
sweepCollection_.calculateMacroscopicParameters(block.get());
const VectorField_T* velocityField = block->template getData< const VectorField_T >(ids_.velocityField);
const auto level = blocks_->getLevel(*block);
const real_t volumeFraction =
(blocks_->dx(level) * blocks_->dy(level) * blocks_->dz(level)) / domainAABB.volume();
for (auto it = velocityField->beginXYZ(); it != velocityField->end(); ++it)
{
Vector3< real_t > center = blocks_->getBlockLocalCellCenter(*block, Cell(it.x(), it.y(), it.z()));
Vector3< real_t > exactVelocity(
real_c(maxLatticeVelocity_ * (center[1] - real_c(domainAABB.yMin())) / real_c(domainAABB.ySize())),
real_c(0.0), real_c(0.0));
Vector3< real_t > velocity(it.getF(0), it.getF(1), it.getF(2));
const auto error = velocity - exactVelocity;
const real_t diff = error.length();
_L1 += diff * volumeFraction;
_L2 += diff * diff * volumeFraction;
_Lmax = std::max(_Lmax, diff);
}
}
mpi::reduceInplace(_L1, mpi::SUM);
mpi::reduceInplace(_L2, mpi::SUM);
mpi::reduceInplace(_Lmax, mpi::MAX);
_L2 = std::sqrt(_L2);
L1_ = _L1;
L2_ = _L2;
Lmax_ = _Lmax;
WALBERLA_ROOT_SECTION()
{
if (plot && executionCounter_ == uint_t(1))
{
std::ofstream file(filename_.c_str());
file << "# accuracy evaluation"
<< "# step [1], L1 [2], L2 [3], Lmax [4]" << std::endl;
file.close();
}
if (log)
{
WALBERLA_LOG_INFO("Evaluation of accuracy:"
<< "\n - L1: " << L1_ << "\n - L2: " << L2_ << "\n - Lmax: " << Lmax_);
}
if (plot)
{
std::ofstream file(filename_.c_str(), std::ofstream::out | std::ofstream::app);
file << (executionCounter_ - uint_t(1)) << " " << L1_ << " " << L2_ << " " << Lmax_ << std::endl;
file.close();
}
}
}
private:
std::shared_ptr< StructuredBlockForest > blocks_;
IDs ids_;
SweepCollection_T& sweepCollection_;
real_t maxLatticeVelocity_;
uint_t executionCounter_;
uint_t plotFrequency_;
uint_t logFrequency_;
std::string filename_;
real_t normalizationFactor_;
real_t L1_;
real_t L2_;
real_t Lmax_;
}; // class AccuracyEvaluation
namespace
{
void workloadMemoryAndSUIDAssignment(SetupBlockForest& forest, const memory_t memoryPerBlock)
{
for (auto block = forest.begin(); block != forest.end(); ++block)
{
block->setWorkload(numeric_cast< workload_t >(uint_t(1) << block->getLevel()));
block->setMemory(memoryPerBlock);
}
}
shared_ptr< SetupBlockForest > createSetupBlockForest(const Setup& setup, uint_t numberOfProcesses, const memory_t memoryPerCell)
{
shared_ptr< SetupBlockForest > forest = make_shared< SetupBlockForest >();
const memory_t memoryPerBlock =
numeric_cast< memory_t >((setup.cellsPerBlock[0] + uint_t(2) * setup.numGhostLayers) *
(setup.cellsPerBlock[1] + uint_t(2) * setup.numGhostLayers) *
(setup.cellsPerBlock[2] + uint_t(2) * setup.numGhostLayers)) *
memoryPerCell;
forest->addWorkloadMemorySUIDAssignmentFunction(
std::bind(workloadMemoryAndSUIDAssignment, std::placeholders::_1, memoryPerBlock));
forest->init(
AABB(real_c(0), real_c(0), real_c(0), real_c(setup.cells[0]), real_c(setup.cells[1]), real_c(setup.cells[2])),
setup.blocks[0], setup.blocks[1], setup.blocks[2], setup.periodic[0], setup.periodic[1], setup.periodic[2]);
MPIManager::instance()->useWorldComm();
forest->balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numberOfProcesses);
// WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << *forest);
return forest;
}
shared_ptr< blockforest::StructuredBlockForest > createStructuredBlockForest(const Setup& setup, const memory_t memoryPerCell)
{
// WALBERLA_LOG_INFO_ON_ROOT("Creating the block structure ...");
shared_ptr< SetupBlockForest > sforest = createSetupBlockForest(setup, uint_c(MPIManager::instance()->numProcesses()), memoryPerCell);
auto bf = std::make_shared< blockforest::BlockForest >(uint_c(MPIManager::instance()->rank()), *sforest, false);
auto sbf = std::make_shared< blockforest::StructuredBlockForest >(bf, setup.cellsPerBlock[0], setup.cellsPerBlock[1],
setup.cellsPerBlock[2]);
sbf->createCellBoundingBoxes();
return sbf;
}
} // namespace
class StreamCollide
{
public:
StreamCollide(const shared_ptr< StructuredBlockForest > & blocks, const IDs& ids, const real_t omega) : ids_(ids)
{
auto pdfsID = ids.pdfField;
_nBlocks = blocks->getNumberOfBlocks();
blocks->getBlocks(blockVector, 0);
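// blockVector collects only the level-0 blocks; this sweep and swap() operate
// exclusively on level 0. The loop below rescales the relaxation rate per
// refinement level (the time step halves on each finer level):
//   omega_level = omega / ( 2^level * (1 - omega/2) + omega/2 )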
for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
{
const double level_scale_factor = double(uint_t(1) << level);
const double one = double(1.0);
const double half = double(0.5);
omegaVector.push_back( double(omega / (level_scale_factor * (-omega * half + one) + omega * half)) );
}
BlockDataID flagID = field::addFlagFieldToStorage< FlagField< uint8_t > >(blocks, "counter flags", uint_c(1));
for( auto it = blocks->begin(); it != blocks->end(); ++it )
{
Block * block = dynamic_cast< Block * >( it.get() );
auto * flagField = block->getData< FlagField< uint8_t > > ( flagID );
const uint8_t baseFlag = flagField->registerFlag(FlagUID("base"));
const uint8_t xFlagP = flagField->registerFlag(FlagUID("xP"));
const uint8_t xFlagM = flagField->registerFlag(FlagUID("xM"));
const uint8_t yFlagP = flagField->registerFlag(FlagUID("yP"));
const uint8_t yFlagM = flagField->registerFlag(FlagUID("yM"));
const uint8_t zFlagP = flagField->registerFlag(FlagUID("zP"));
const uint8_t zFlagM = flagField->registerFlag(FlagUID("zM"));
const uint8_t streamFlag = flagField->registerFlag(FlagUID("stream"));
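// registerFlag() assigns consecutive bits, so the layout is: bit 0 = base,
// bits 1-6 = cell on a block face towards +x,-x,+y,-y,+z,-z with an equally
// sized local neighbor, bit 7 = stream (ghost layer shadowing a coarser or
// missing neighbor). streamCollideEven() reads bits 1-6 to redirect pull
// accesses across block faces.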
for (auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir) {
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) {
CellInterval ci;
flagField->getSliceBeforeGhostLayer(*dir, ci);
for (auto cell: ci){flagField->addFlag(cell, baseFlag);}
continue;
}
if (block->neighborhoodSectionHasEquallySizedBlock(neighborIdx) && block->neighborExistsLocally( neighborIdx, uint_t(0) )) {
CellInterval ci;
flagField->getSliceBeforeGhostLayer(*dir, ci);
for (auto cell: ci){
if(stencil::cx[*dir] == 1) {flagField->addFlag(cell, xFlagP);}
if(stencil::cx[*dir] == -1) {flagField->addFlag(cell, xFlagM);}
if(stencil::cy[*dir] == 1) {flagField->addFlag(cell, yFlagP);}
if(stencil::cy[*dir] == -1) {flagField->addFlag(cell, yFlagM);}
if(stencil::cz[*dir] == 1) {flagField->addFlag(cell, zFlagP);}
if(stencil::cz[*dir] == -1) {flagField->addFlag(cell, zFlagM);}
}
continue;
}
// Propagate on ghost layers shadowing coarse or no blocks
if(block->neighborhoodSectionHasLargerBlock(neighborIdx)){
CellInterval ci;
flagField->getGhostRegion(*dir, ci, 1);
for (auto cell: ci){flagField->addFlag(cell, streamFlag);}
continue;
}
CellInterval ci;
flagField->getSliceBeforeGhostLayer(*dir, ci);
for (auto cell: ci){flagField->addFlag(cell, baseFlag);}
}
}
for( auto it = blocks->begin(); it != blocks->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
auto pdfs_tmp = local->getData< PdfField_T >(ids_.pdfField)->cloneUninitialized();
cache_pdfs_[local] = pdfs_tmp;
}
flagPointers.resize(blocks->getNumberOfLevels());
pdfsPointers.resize(blocks->getNumberOfLevels());
pdfs_tmpPointers.resize(blocks->getNumberOfLevels());
for( auto it = blocks->begin(); it != blocks->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
const uint_t level = local->getLevel();
for (cell_idx_t ctr_2 = -1; ctr_2 < 2; ++ctr_2){
for (cell_idx_t ctr_1 = -1; ctr_1 < 2; ++ctr_1){
for (cell_idx_t ctr_0 = -1; ctr_0 < 2; ++ctr_0){
auto dir = stencil::vectorToDirection(ctr_0, ctr_1, ctr_2);
if(dir == stencil::C)
{
auto flag = local->getData< field::GhostLayerField<uint8_t, 1> >(flagID);
flagPointers[level].emplace_back(flag->dataAt(0, 0, 0, 0));
auto pdfs = local->getData< field::GhostLayerField<double, 27> >(pdfsID);
pdfsPointers[level].emplace_back(pdfs->dataAt(0, 0, 0, 0));
auto pdfs_tmp = cache_pdfs_[local];
pdfs_tmpPointers[level].emplace_back(pdfs_tmp->dataAt(0, 0, 0, 0));
_size_flag_0 = int64_t(int64_c(flag->xSize()));
_size_flag_1 = int64_t(int64_c(flag->ySize()));
_size_flag_2 = int64_t(int64_c(flag->zSize()));
_stride_flag_0 = int64_t(flag->xStride());
_stride_flag_1 = int64_t(flag->yStride());
_stride_flag_2 = int64_t(flag->zStride());
_stride_flag_3 = int64_t(1 * int64_t(flag->fStride()));
_stride_pdfs_0 = int64_t(pdfs->xStride());
_stride_pdfs_1 = int64_t(pdfs->yStride());
_stride_pdfs_2 = int64_t(pdfs->zStride());
_stride_pdfs_3 = int64_t(1 * int64_t(pdfs->fStride()));
_stride_pdfs_tmp_0 = int64_t(pdfs_tmp->xStride());
_stride_pdfs_tmp_1 = int64_t(pdfs_tmp->yStride());
_stride_pdfs_tmp_2 = int64_t(pdfs_tmp->zStride());
_stride_pdfs_tmp_3 = int64_t(1 * int64_t(pdfs_tmp->fStride()));
}
else{
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(dir);
if (local->neighborhoodSectionHasEquallySizedBlock(neighborIdx) && local->neighborExistsLocally( neighborIdx, uint_t(0) )) {
const BlockID& neighborId = local->getNeighborId(neighborIdx, uint_t(0));
auto neighbor = dynamic_cast< Block* >(blocks->getBlock(neighborId));
auto flag = neighbor->getData< field::GhostLayerField<uint8_t, 1> >(flagID);
auto pdfs = neighbor->getData< field::GhostLayerField<double, 27> >(pdfsID);
auto pdfs_tmp = cache_pdfs_[neighbor];
const cell_idx_t x = (stencil::cx[dir] == -1) ? cell_idx_c(pdfs->xSize() - 1) : 0;
const cell_idx_t y = (stencil::cy[dir] == -1) ? cell_idx_c(pdfs->ySize() - 1) : 0;
const cell_idx_t z = (stencil::cz[dir] == -1) ? cell_idx_c(pdfs->zSize() - 1) : 0;
flagPointers[level].emplace_back(flag->dataAt(x, y, z, 0));
pdfsPointers[level].emplace_back(pdfs->dataAt(x, y, z, 0));
pdfs_tmpPointers[level].emplace_back(pdfs_tmp->dataAt(x, y, z, 0));
}
else{
auto flag = local->getData< field::GhostLayerField<uint8_t, 1> >(flagID);
flagPointers[level].emplace_back(flag->dataAt(0, 0, 0, 0));
auto pdfs = local->getData< field::GhostLayerField<double, 27> >(pdfsID);
pdfsPointers[level].emplace_back(pdfs->dataAt(0, 0, 0, 0));
pdfs_tmpPointers[level].emplace_back(cache_pdfs_[local]->dataAt(0, 0, 0, 0));
}
}
}
}
}
}
}
void streamCollideEven()
{
uint8_t ** RESTRICT const _data_flag_dp = flagPointers[0].data();
double ** RESTRICT const _data_pdfs_dp = pdfsPointers[0].data();
double ** RESTRICT _data_pdfs_tmp_dp = pdfs_tmpPointers[0].data();
double omega = omegaVector[0];
const double xi_0 = ((1.0) / (omega*-0.25 + 2.0));
const double rr_0 = xi_0*(omega*-2.0 + 4.0);
for(uint_t index = 0; index < _nBlocks; ++index)
{
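// Each block owns 27 pointer-table entries, filled in the constructor in
// z-major order, i.e. entry (z+1)*9 + (y+1)*3 + (x+1); 27*index + 13 is the
// block's own (center) field. Only level 0 is processed by this kernel.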
const uint8_t * RESTRICT _data_flag = ((uint8_t * RESTRICT const)(_data_flag_dp[27*index + 13]));
const double * RESTRICT _data_pdfs = ((double * RESTRICT const)(_data_pdfs_dp[27*index + 13]));
double * RESTRICT _data_pdfs_tmp = ((double * RESTRICT )(_data_pdfs_tmp_dp[27*index + 13]));
for (int64_t ctr_2 = 0; ctr_2 < _size_flag_2; ctr_2 += 1)
{
for (int64_t ctr_1 = 0; ctr_1 < _size_flag_1; ctr_1 += 1)
{
for (int64_t ctr_0 = 0; ctr_0 < _size_flag_0; ctr_0 += 1)
{
const uint_t index_C = uint_c(27) * index + uint_c( (0 + 1) * 9 + (0 + 1) * 3 + 0 + 1 );
const uint8_t flag = _data_flag[_stride_flag_0*ctr_0 + _stride_flag_1*ctr_1 + _stride_flag_2*ctr_2];
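// For interior cells every index_* below resolves to the center entry (13).
// On a block face, the corresponding flag bit switches the term to the
// neighboring block's table entry, whose pointer was pre-offset in the
// constructor to the adjacent border layer, so the pull access reads the
// neighbor's interior instead of a ghost layer.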
const uint64_t index_N = ((uint64_t)(index*27 + uint64_c( (1 + ((flag) >> (4) & 1) * (-1))*3 + 1*9 + 1 )));
const uint64_t index_S = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + 1*9 + 1));
const uint64_t index_W = ((uint64_t)(index*27 + 1*3 + 1*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_E = ((uint64_t)(index*27 + 1*3 + 1*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_T = ((uint64_t)(index*27 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1*3 + 1));
const uint64_t index_B = ((uint64_t)(index*27 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1*3 + 1));
const uint64_t index_NW = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + 1*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_NE = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + 1*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_SW = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + 1*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_SE = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + 1*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TN = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1));
const uint64_t index_TS = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1));
const uint64_t index_TW = ((uint64_t)(index*27 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1*3 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_TE = ((uint64_t)(index*27 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1*3 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_BN = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1));
const uint64_t index_BS = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1));
const uint64_t index_BW = ((uint64_t)(index*27 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1*3 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_BE = ((uint64_t)(index*27 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1*3 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TNE = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TNW = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_TSE = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_TSW = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (6) & 1) * (-1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_BNE = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_BNW = ((uint64_t)(index*27 + (1 + ((flag) >> (4) & 1) * (-1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const uint64_t index_BSE = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (2) & 1) * (-1)));
const uint64_t index_BSW = ((uint64_t)(index*27 + (1 + ((flag) >> (3) & 1) * (1))*3 + (1 + ((flag) >> (5) & 1) * (1))*9 + 1 + ((flag) >> (1) & 1) * (1)));
const double * RESTRICT _data_N = _data_pdfs_dp[index_N];
const double * RESTRICT _data_S = ((double * RESTRICT const)(_data_pdfs_dp[index_S]));
const double * RESTRICT _data_W = ((double * RESTRICT const)(_data_pdfs_dp[index_W]));
const double * RESTRICT _data_E = ((double * RESTRICT const)(_data_pdfs_dp[index_E]));
const double * RESTRICT _data_T = ((double * RESTRICT const)(_data_pdfs_dp[index_T]));
const double * RESTRICT _data_B = ((double * RESTRICT const)(_data_pdfs_dp[index_B]));
const double * RESTRICT _data_NW = ((double * RESTRICT const)(_data_pdfs_dp[index_NW]));
const double * RESTRICT _data_NE = ((double * RESTRICT const)(_data_pdfs_dp[index_NE]));
const double * RESTRICT _data_SW = ((double * RESTRICT const)(_data_pdfs_dp[index_SW]));
const double * RESTRICT _data_SE = ((double * RESTRICT const)(_data_pdfs_dp[index_SE]));
const double * RESTRICT _data_TN = ((double * RESTRICT const)(_data_pdfs_dp[index_TN]));
const double * RESTRICT _data_TS = ((double * RESTRICT const)(_data_pdfs_dp[index_TS]));
const double * RESTRICT _data_TW = ((double * RESTRICT const)(_data_pdfs_dp[index_TW]));
const double * RESTRICT _data_TE = ((double * RESTRICT const)(_data_pdfs_dp[index_TE]));
const double * RESTRICT _data_BN = ((double * RESTRICT const)(_data_pdfs_dp[index_BN]));
const double * RESTRICT _data_BS = ((double * RESTRICT const)(_data_pdfs_dp[index_BS]));
const double * RESTRICT _data_BW = ((double * RESTRICT const)(_data_pdfs_dp[index_BW]));
const double * RESTRICT _data_BE = ((double * RESTRICT const)(_data_pdfs_dp[index_BE]));
const double * RESTRICT _data_TNE = ((double * RESTRICT const)(_data_pdfs_dp[index_TNE]));
const double * RESTRICT _data_TNW = ((double * RESTRICT const)(_data_pdfs_dp[index_TNW]));
const double * RESTRICT _data_TSE = ((double * RESTRICT const)(_data_pdfs_dp[index_TSE]));
const double * RESTRICT _data_TSW = ((double * RESTRICT const)(_data_pdfs_dp[index_TSW]));
const double * RESTRICT _data_BNE = ((double * RESTRICT const)(_data_pdfs_dp[index_BNE]));
const double * RESTRICT _data_BNW = ((double * RESTRICT const)(_data_pdfs_dp[index_BNW]));
const double * RESTRICT _data_BSE = ((double * RESTRICT const)(_data_pdfs_dp[index_BSE]));
const double * RESTRICT _data_BSW = ((double * RESTRICT const)(_data_pdfs_dp[index_BSW]));
const double f_0 = _data_pdfs[ctr_0*_stride_pdfs_0 + ctr_1*_stride_pdfs_1 + ctr_2*_stride_pdfs_2 + 0*_stride_pdfs_3];
const double f_1 = _data_N[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*ctr_2 + _stride_pdfs_3];
const double f_2 = _data_S[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*ctr_2 + 2*_stride_pdfs_3];
const double f_3 = _data_W[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 3*_stride_pdfs_3];
const double f_4 = _data_E[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*ctr_2 + 4*_stride_pdfs_3];
const double f_5 = _data_T[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 5*_stride_pdfs_3];
const double f_6 = _data_B[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 6*_stride_pdfs_3];
const double f_7 = _data_NW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*ctr_2 + 7*_stride_pdfs_3];
const double f_8 = _data_NE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*ctr_2 + 8*_stride_pdfs_3];
const double f_9 = _data_SW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*ctr_2 + 9*_stride_pdfs_3];
const double f_10 = _data_SE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*ctr_2 + 10*_stride_pdfs_3];
const double f_11 = _data_TN[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 11*_stride_pdfs_3];
const double f_12 = _data_TS[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 12*_stride_pdfs_3];
const double f_13 = _data_TW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 13*_stride_pdfs_3];
const double f_14 = _data_TE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 14*_stride_pdfs_3];
const double f_15 = _data_BN[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 15*_stride_pdfs_3];
const double f_16 = _data_BS[_stride_pdfs_0*ctr_0 + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 16*_stride_pdfs_3];
const double f_17 = _data_BW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 17*_stride_pdfs_3];
const double f_18 = _data_BE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*ctr_1 + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 18*_stride_pdfs_3];
const double f_19 = _data_TNE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 19*_stride_pdfs_3];
const double f_20 = _data_TNW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 20*_stride_pdfs_3];
const double f_21 = _data_TSE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 21*_stride_pdfs_3];
const double f_22 = _data_TSW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (6) & 1) * (1 - ctr_2) - 1) + 22*_stride_pdfs_3];
const double f_23 = _data_BNE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 23*_stride_pdfs_3];
const double f_24 = _data_BNW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (4) & 1) * (1 - ctr_1) - 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 24*_stride_pdfs_3];
const double f_25 = _data_BSE[_stride_pdfs_0*(ctr_0 + ((flag) >> (2) & 1) * (1 - ctr_0) - 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 25*_stride_pdfs_3];
const double f_26 = _data_BSW[_stride_pdfs_0*(ctr_0 + ((flag) >> (1) & 1) * (-ctr_0 - 1) + 1) + _stride_pdfs_1*(ctr_1 + ((flag) >> (3) & 1) * (-ctr_1 - 1) + 1) + _stride_pdfs_2*(ctr_2 + ((flag) >> (5) & 1) * (-ctr_2 - 1) + 1) + 26*_stride_pdfs_3];
const double vel0Term = f_10 + f_14 + f_18 + f_19 + f_21 + f_23 + f_25 + f_4 + f_8;
const double vel1Term = f_1 + f_11 + f_15 + f_20 + f_24 + f_7;
const double vel2Term = f_12 + f_13 + f_22 + f_5;
const double delta_rho = f_0 + f_16 + f_17 + f_2 + f_26 + f_3 + f_6 + f_9 + vel0Term + vel1Term + vel2Term;
const double rho = delta_rho + 1.0;
const double xi_1 = ((1.0) / (rho));
const double u_0 = xi_1*(-f_13 - f_17 - f_20 - f_22 - f_24 - f_26 - f_3 - f_7 - f_9 + vel0Term);
const double u_1 = xi_1*(-f_10 - f_12 - f_16 + f_19 - f_2 - f_21 - f_22 + f_23 - f_25 - f_26 + f_8 - f_9 + vel1Term);
const double u_2 = xi_1*(f_11 + f_14 - f_15 - f_16 - f_17 - f_18 + f_19 + f_20 + f_21 - f_23 - f_24 - f_25 - f_26 - f_6 + vel2Term);
const double u0Mu1 = u_0 - u_1;
const double u0Pu1 = u_0 + u_1;
const double u1Pu2 = u_1 + u_2;
const double u1Mu2 = u_1 - u_2;
const double u0Mu2 = u_0 - u_2;
const double u0Pu2 = u_0 + u_2;
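// Common part of all equilibrium terms below:
//   f_eq_common = delta_rho - 1.5 * rho * (u_0^2 + u_1^2 + u_2^2)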
const double f_eq_common = delta_rho + rho*-1.5*(u_0*u_0) + rho*-1.5*(u_1*u_1) + rho*-1.5*(u_2*u_2);
const double r_0 = f_0 + omega*(-f_0 + f_eq_common*0.29629629629629628);
const double r_1 = f_1 + omega*(f_1*-0.5 + f_2*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_1*u_1)) + rr_0*(f_1*-0.5 + f_2*0.5 + rho*u_1*0.22222222222222221);
const double r_2 = f_2 + omega*(f_1*-0.5 + f_2*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_1*u_1)) + rr_0*(f_1*0.5 + f_2*-0.5 + rho*u_1*-0.22222222222222221);
const double r_3 = f_3 + omega*(f_3*-0.5 + f_4*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_0*u_0)) + rr_0*(f_3*-0.5 + f_4*0.5 + rho*u_0*-0.22222222222222221);
const double r_4 = f_4 + omega*(f_3*-0.5 + f_4*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_0*u_0)) + rr_0*(f_3*0.5 + f_4*-0.5 + rho*u_0*0.22222222222222221);
const double r_5 = f_5 + omega*(f_5*-0.5 + f_6*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_2*u_2)) + rr_0*(f_5*-0.5 + f_6*0.5 + rho*u_2*0.22222222222222221);
const double r_6 = f_6 + omega*(f_5*-0.5 + f_6*-0.5 + f_eq_common*0.07407407407407407 + rho*0.33333333333333331*(u_2*u_2)) + rr_0*(f_5*0.5 + f_6*-0.5 + rho*u_2*-0.22222222222222221);
const double r_7 = f_7 + omega*(f_10*-0.5 + f_7*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu1*u0Mu1)) + rr_0*(f_10*0.5 + f_7*-0.5 + rho*u0Mu1*-0.055555555555555552);
const double r_8 = f_8 + omega*(f_8*-0.5 + f_9*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu1*u0Pu1)) + rr_0*(f_8*-0.5 + f_9*0.5 + rho*u0Pu1*0.055555555555555552);
const double r_9 = f_9 + omega*(f_8*-0.5 + f_9*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu1*u0Pu1)) + rr_0*(f_8*0.5 + f_9*-0.5 + rho*u0Pu1*-0.055555555555555552);
const double r_10 = f_10 + omega*(f_10*-0.5 + f_7*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu1*u0Mu1)) + rr_0*(f_10*-0.5 + f_7*0.5 + rho*u0Mu1*0.055555555555555552);
const double r_11 = f_11 + omega*(f_11*-0.5 + f_16*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Pu2*u1Pu2)) + rr_0*(f_11*-0.5 + f_16*0.5 + rho*u1Pu2*0.055555555555555552);
const double r_12 = f_12 + omega*(f_12*-0.5 + f_15*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Mu2*u1Mu2)) + rr_0*(f_12*-0.5 + f_15*0.5 + rho*u1Mu2*-0.055555555555555552);
const double r_13 = f_13 + omega*(f_13*-0.5 + f_18*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu2*u0Mu2)) + rr_0*(f_13*-0.5 + f_18*0.5 + rho*u0Mu2*-0.055555555555555552);
const double r_14 = f_14 + omega*(f_14*-0.5 + f_17*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu2*u0Pu2)) + rr_0*(f_14*-0.5 + f_17*0.5 + rho*u0Pu2*0.055555555555555552);
const double r_15 = f_15 + omega*(f_12*-0.5 + f_15*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Mu2*u1Mu2)) + rr_0*(f_12*0.5 + f_15*-0.5 + rho*u1Mu2*0.055555555555555552);
const double r_16 = f_16 + omega*(f_11*-0.5 + f_16*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u1Pu2*u1Pu2)) + rr_0*(f_11*0.5 + f_16*-0.5 + rho*u1Pu2*-0.055555555555555552);
const double r_17 = f_17 + omega*(f_14*-0.5 + f_17*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Pu2*u0Pu2)) + rr_0*(f_14*0.5 + f_17*-0.5 + rho*u0Pu2*-0.055555555555555552);
const double r_18 = f_18 + omega*(f_13*-0.5 + f_18*-0.5 + f_eq_common*0.018518518518518517 + rho*0.083333333333333329*(u0Mu2*u0Mu2)) + rr_0*(f_13*0.5 + f_18*-0.5 + rho*u0Mu2*0.055555555555555552);
const double r_19 = f_19 + omega*(delta_rho*-0.013888888888888888 + f_19*-0.5 + f_26*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_19*-0.5 + f_26*0.5 + rho*(u0Pu1*0.013888888888888888 + u_2*0.013888888888888888));
const double r_20 = f_20 + omega*(delta_rho*-0.013888888888888888 + f_20*-0.5 + f_25*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_20*-0.5 + f_25*0.5 + rho*(u0Mu1*-0.013888888888888888 + u_2*0.013888888888888888));
const double r_21 = f_21 + omega*(delta_rho*-0.013888888888888888 + f_21*-0.5 + f_24*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_21*-0.5 + f_24*0.5 + rho*(u0Mu1*0.013888888888888888 + u_2*0.013888888888888888));
const double r_22 = f_22 + omega*(delta_rho*-0.013888888888888888 + f_22*-0.5 + f_23*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_22*-0.5 + f_23*0.5 + rho*(u0Pu1*-0.013888888888888888 + u_2*0.013888888888888888));
const double r_23 = f_23 + omega*(delta_rho*-0.013888888888888888 + f_22*-0.5 + f_23*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_22*0.5 + f_23*-0.5 + rho*(u0Pu1*0.013888888888888888 + u_2*-0.013888888888888888));
const double r_24 = f_24 + omega*(delta_rho*-0.013888888888888888 + f_21*-0.5 + f_24*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Mu2*u1Mu2))) + rr_0*(f_21*0.5 + f_24*-0.5 + rho*(u0Mu1*-0.013888888888888888 + u_2*-0.013888888888888888));
const double r_25 = f_25 + omega*(delta_rho*-0.013888888888888888 + f_20*-0.5 + f_25*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Mu1*u0Mu1) + 0.020833333333333332*(u0Mu2*u0Mu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_20*0.5 + f_25*-0.5 + rho*(u0Mu1*0.013888888888888888 + u_2*-0.013888888888888888));
const double r_26 = f_26 + omega*(delta_rho*-0.013888888888888888 + f_19*-0.5 + f_26*-0.5 + f_eq_common*0.018518518518518517 + rho*(0.020833333333333332*(u0Pu1*u0Pu1) + 0.020833333333333332*(u0Pu2*u0Pu2) + 0.020833333333333332*(u1Pu2*u1Pu2))) + rr_0*(f_19*0.5 + f_26*-0.5 + rho*(u0Pu1*-0.013888888888888888 + u_2*-0.013888888888888888));
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 0*_stride_pdfs_tmp_3] = r_0;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 1*_stride_pdfs_tmp_3] = r_1;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 2*_stride_pdfs_tmp_3] = r_2;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 3*_stride_pdfs_tmp_3] = r_3;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 4*_stride_pdfs_tmp_3] = r_4;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 5*_stride_pdfs_tmp_3] = r_5;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 6*_stride_pdfs_tmp_3] = r_6;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 7*_stride_pdfs_tmp_3] = r_7;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 8*_stride_pdfs_tmp_3] = r_8;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 9*_stride_pdfs_tmp_3] = r_9;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 10*_stride_pdfs_tmp_3] = r_10;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 11*_stride_pdfs_tmp_3] = r_11;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 12*_stride_pdfs_tmp_3] = r_12;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 13*_stride_pdfs_tmp_3] = r_13;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 14*_stride_pdfs_tmp_3] = r_14;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 15*_stride_pdfs_tmp_3] = r_15;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 16*_stride_pdfs_tmp_3] = r_16;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 17*_stride_pdfs_tmp_3] = r_17;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 18*_stride_pdfs_tmp_3] = r_18;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 19*_stride_pdfs_tmp_3] = r_19;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 20*_stride_pdfs_tmp_3] = r_20;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 21*_stride_pdfs_tmp_3] = r_21;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 22*_stride_pdfs_tmp_3] = r_22;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 23*_stride_pdfs_tmp_3] = r_23;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 24*_stride_pdfs_tmp_3] = r_24;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 25*_stride_pdfs_tmp_3] = r_25;
_data_pdfs_tmp[ctr_0*_stride_pdfs_tmp_0 + ctr_1*_stride_pdfs_tmp_1 + ctr_2*_stride_pdfs_tmp_2 + 26*_stride_pdfs_tmp_3] = r_26;
}
}
}
}
}
void swap()
{
for( auto block: blockVector )
{
PdfField_T * srcField = block->getData< PdfField_T >( ids_.pdfField );
auto dstField = cache_pdfs_[block];
srcField->swapDataPointers(dstField);
}
std::swap(pdfsPointers[0], pdfs_tmpPointers[0]);
}
private:
IDs ids_;
std::vector<Block*> blockVector;
std::vector<real_t> omegaVector;
std::vector<std::vector<uint8_t *>> flagPointers;
std::vector<std::vector<double *>> pdfsPointers;
std::vector<std::vector<double *>> pdfs_tmpPointers;
std::unordered_map<IBlock*, field::GhostLayerField<double, 27> *> cache_pdfs_;
int64_t _size_flag_0;
int64_t _size_flag_1;
int64_t _size_flag_2;
uint_t _nBlocks;
int64_t _stride_pdfs_0;
int64_t _stride_pdfs_1;
int64_t _stride_pdfs_2;
int64_t _stride_pdfs_3;
int64_t _stride_flag_0;
int64_t _stride_flag_1;
int64_t _stride_flag_2;
int64_t _stride_flag_3;
int64_t _stride_pdfs_tmp_0;
int64_t _stride_pdfs_tmp_1;
int64_t _stride_pdfs_tmp_2;
int64_t _stride_pdfs_tmp_3;
};
class Timestep
{
public:
Timestep(std::shared_ptr< StructuredBlockForest >& blocks, const IDs& ids,
StreamCollide & streamCollide, std::shared_ptr< PackInfo_T >& packInfo, UniformBufferedScheme<CommunicationStencil_T>& communication)
: blocks_(blocks), ids_(ids), streamCollide_(streamCollide), packInfo_(packInfo), communication_(communication)
{
}
void commLocal(const uint_t level)
{
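// Process-local ghost-layer exchange: for every equally sized neighbor on the
// given level, copy the slice before the ghost layer straight into that
// neighbor's ghost region, restricted per direction to the PDFs that stream
// across the shared face. This assumes the neighbor block exists locally.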
for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
{
if(blocks_->getLevel(*it.get()) != level)
continue;
auto* sender = dynamic_cast< Block* >(it.get());
for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir )
{
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
if (!(sender->neighborhoodSectionHasEquallySizedBlock(neighborIdx)))
continue;
const BlockID & receiverId = sender->getNeighborId( neighborIdx, uint_t(0) );
auto receiver = dynamic_cast< Block * >( blocks_->getBlock(receiverId) );
const PdfField_T * srcField = sender->getData< PdfField_T >( ids_.pdfField );
PdfField_T * dstField = receiver->getData< PdfField_T >( ids_.pdfField );
CellInterval srcRegion;
CellInterval dstRegion;
cell_idx_t gls = 1;
srcField->getSliceBeforeGhostLayer(*dir, srcRegion, gls, false);
dstField->getGhostRegion(stencil::inverseDir[*dir], dstRegion, gls, false);
WALBERLA_ASSERT_EQUAL(srcRegion.xSize(), dstRegion.xSize())
WALBERLA_ASSERT_EQUAL(srcRegion.ySize(), dstRegion.ySize())
WALBERLA_ASSERT_EQUAL(srcRegion.zSize(), dstRegion.zSize())
//WALBERLA_LOG_DEVEL_VAR(srcRegion)
//WALBERLA_LOG_DEVEL_VAR(dstRegion)
auto srcIter = srcRegion.begin();
auto dstIter = dstRegion.begin();
while (srcIter != srcRegion.end())
{
for( uint_t f = 0; f < Stencil_T::d_per_d_length[*dir]; ++f )
{
dstField->get(*dstIter, Stencil_T::idx[ Stencil_T::d_per_d[*dir][f] ]) = srcField->get(*srcIter, Stencil_T::idx[ Stencil_T::d_per_d[*dir][f] ]);
}
++srcIter;
++dstIter;
}
WALBERLA_ASSERT( srcIter == srcRegion.end() )
WALBERLA_ASSERT( dstIter == dstRegion.end() )
}
}
}
void swap()
{
streamCollide_.swap();
}
void timestep()
{
// for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
// {
// auto block = dynamic_cast< Block* >(it.get());
// commLocal(0);
// streamCollide_.streamCollide(block);
// // streamCollide_(block);
// }
// commLocal(0);
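// Note: both branches below currently invoke the even kernel followed by a
// pointer swap; a dedicated odd-timestep kernel is not implemented in this
// revision.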
if ((timestepCounter_ & 1) == 0)
{
streamCollide_.streamCollideEven();
swap();
}
else
{
streamCollide_.streamCollideEven();
swap();
}
timestepCounter_ = (timestepCounter_ + 1) & 1;
}
void operator()(){ timestep(); };
private:
std::shared_ptr< StructuredBlockForest > blocks_;
IDs ids_;
StreamCollide & streamCollide_;
std::shared_ptr< PackInfo_T > packInfo_;
UniformBufferedScheme<CommunicationStencil_T> communication_;
uint8_t timestepCounter_{0};
};
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
shared_ptr< Config > config = make_shared< Config >();
config->readParameterFile(argv[1]);
logging::configureLogging(config);
// read parameters
auto parameters = config->getOneBlock("Parameters");
auto loggingParameters = config->getOneBlock("Logging");
auto domainParameters = config->getOneBlock("DomainSetup");
// auto EvaluationParameters = config->getOneBlock("Evaluation");
const real_t omega = parameters.getParameter< real_t >("omega");
const real_t maxLatticeVelocity = parameters.getParameter< real_t >("maxLatticeVelocity");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps") + uint_c(1);
Setup setup;
setup.blocks = domainParameters.getParameter< Vector3< uint_t > >("blocks");
setup.cellsPerBlock = domainParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
setup.cells = Vector3< uint_t >(setup.blocks[0] * setup.cellsPerBlock[0], setup.blocks[1] * setup.cellsPerBlock[1],
setup.blocks[2] * setup.cellsPerBlock[2]);
setup.periodic = domainParameters.getParameter< Vector3< bool > >("periodic");
setup.numGhostLayers = uint_c(1);
const uint_t valuesPerCell = (uint_c(2) * Stencil_T::Q + VectorField_T::F_SIZE + uint_c(2) * ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(PdfField_T::value_type);
const memory_t memoryPerCell = memory_t(valuesPerCell * sizePerValue + uint_c(1));
bool writeSetupForestAndReturn = loggingParameters.getParameter< bool >("writeSetupForestAndReturn", false);
if (uint_c(MPIManager::instance()->numProcesses()) > 1) writeSetupForestAndReturn = false;
if (writeSetupForestAndReturn)
{
std::string sbffile = "TaylorGreenVortex.bfs";
std::ostringstream infoString;
infoString << "You have selected the option of just creating the block structure (= domain decomposition) and "
"saving the result to file\n"
"by specifying the output file name \'"
<< sbffile << "\' AND also specifying \'saveToFile\'.\n";
if (MPIManager::instance()->numProcesses() > 1)
WALBERLA_ABORT(infoString.str() << "In this mode you need to start " << argv[0] << " with just one process!")
WALBERLA_LOG_INFO_ON_ROOT(infoString.str() << "Creating the block structure ...")
const uint_t numberProcesses = domainParameters.getParameter< uint_t >("numberProcesses");
shared_ptr< SetupBlockForest > sforest = createSetupBlockForest(setup, numberProcesses, memoryPerCell);
sforest->writeVTKOutput("domain_decomposition");
sforest->saveToFile(sbffile.c_str());
logging::Logging::printFooterOnStream();
return EXIT_SUCCESS;
}
auto blocks = createStructuredBlockForest(setup, memoryPerCell);
IDs ids;
const StorageSpecification_T StorageSpec = StorageSpecification_T();
ids.pdfField = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, setup.numGhostLayers, field::fzyx);
// ids.pdfFieldTmp = lbm_generated::addPdfFieldToStorage(blocks, "pdfs tmp", StorageSpec, setup.numGhostLayers, field::fzyx);
ids.velocityField = field::addToStorage< VectorField_T >(blocks, "vel", real_c(0.0), field::fzyx, setup.numGhostLayers);
ids.densityField = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, setup.numGhostLayers);
ids.flagField = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
const cell_idx_t domainHalf = cell_idx_c(blocks->getDomain().yMax() / real_c(2.0));
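// Initialise a plane shear profile: +maxLatticeVelocity in the upper half of
// the domain (y above yMax/2), -maxLatticeVelocity in the lower half.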
for (auto& block : *blocks)
{
auto velocityField = block.getData<VectorField_T>(ids.velocityField);
for (cell_idx_t ctr_2 = 0; ctr_2 < cell_idx_c(velocityField->zSize()); ++ctr_2)
{
for(cell_idx_t ctr_1 = 0; ctr_1 < cell_idx_c(velocityField->ySize()); ++ctr_1)
{
for (cell_idx_t ctr_0 = 0; ctr_0 < cell_idx_c(velocityField->xSize()); ++ctr_0)
{
Cell domainCell(ctr_0, ctr_1, ctr_2);
blocks->transformBlockLocalToGlobalCell(domainCell, block);
if((domainCell.y() + real_c(0.5)) > domainHalf)
{
velocityField->get(ctr_0, ctr_1, ctr_2, 0) = maxLatticeVelocity;
}
else
{
velocityField->get(ctr_0, ctr_1, ctr_2, 0) = -maxLatticeVelocity;
}
}
}
}
}
SweepCollection_T sweepCollection(blocks, ids.pdfField, ids.densityField, ids.velocityField, omega);
for (auto& block : *blocks)
{
sweepCollection.initialise(&block, cell_idx_c(setup.numGhostLayers - uint_c(1)));
}
for (auto& block : *blocks)
{
if(blocks->getLevel(block) == 0)
{
sweepCollection.streamCollide(&block);
}
}
sweepCollection.initialiseBlockPointer();
//
// sweepCollection.swap(0);
const FlagUID fluidFlagUID("Fluid");
auto boundariesConfig = config->getBlock("Boundaries");
geometry::initBoundaryHandling< FlagField_T >(*blocks, ids.flagField, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, ids.flagField, fluidFlagUID, cell_idx_c(0));
// BoundaryCollection_T boundaryCollection(blocks, ids.flagField, ids.pdfField, fluidFlagUID, maxLatticeVelocity);
WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
std::shared_ptr< PackInfo_T > packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(ids.pdfField);
UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(packInfo);
SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
auto VTKWriter = config->getOneBlock("VTKWriter");
const uint_t vtkWriteFrequency = VTKWriter.getParameter< uint_t >("vtkWriteFrequency");
const bool writeVelocity = VTKWriter.getParameter< bool >("velocity");
const bool writeDensity = VTKWriter.getParameter< bool >("density");
// const bool writeAverageFields = VTKWriter.getParameter< bool >("averageFields", false);
const bool writeFlag = VTKWriter.getParameter< bool >("flag");
// const bool writeOnlySlice = VTKWriter.getParameter< bool >("writeOnlySlice", true);
const bool amrFileFormat = VTKWriter.getParameter< bool >("amrFileFormat", false);
const bool oneFilePerProcess = VTKWriter.getParameter< bool >("oneFilePerProcess", false);
if (vtkWriteFrequency > 0)
{
auto vtkOutput =
vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_TaylorGreenVortex", "simulation_step",
false, true, true, false, 0, amrFileFormat, oneFilePerProcess);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
{
sweepCollection.calculateMacroscopicParameters(&block);
}
});
if (writeVelocity)
{
auto velWriter = make_shared< field::VTKWriter< VectorField_T, float32 > >(ids.velocityField, "velocity");
vtkOutput->addCellDataWriter(velWriter);
}
if (writeDensity)
{
auto densityWriter = make_shared< field::VTKWriter< ScalarField_T, float32 > >(ids.densityField, "density");
vtkOutput->addCellDataWriter(densityWriter);
}
if (writeFlag)
{
auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(ids.flagField, "flag");
vtkOutput->addCellDataWriter(flagWriter);
}
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
StreamCollide streamCollide(blocks, ids, omega);
Timestep timeloopFunction(blocks, ids, streamCollide, packInfo, communication);
const bool generatedVersion = false;
if(generatedVersion){
timeloop.addFuncBeforeTimeStep(sweepCollection.blockStreamCollideFck(0), "Refinement Cycle");
}
else{
timeloop.addFuncBeforeTimeStep(timeloopFunction, "Refinement Cycle");
}
const real_t remainingTimeLoggerFrequency = loggingParameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0); // in seconds
if (uint_c(remainingTimeLoggerFrequency) > 0)
{
timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger");
}
// const uint_t evaluationCheckFrequency = EvaluationParameters.getParameter< uint_t >("evaluationCheckFrequency");
// const bool evaluationLogToStream = EvaluationParameters.getParameter< bool >("logToStream");
// const bool evaluationLogToFile = EvaluationParameters.getParameter< bool >("logToFile");
// const std::string evaluationFilename = EvaluationParameters.getParameter< std::string >("filename");
// std::shared_ptr< AccuracyEvaluation > evaluation = std::make_shared< AccuracyEvaluation >(blocks, ids, sweepCollection, maxLatticeVelocity, evaluationCheckFrequency, evaluationLogToStream, evaluationLogToFile);
// timeloop.addFuncBeforeTimeStep(SharedFunctor< AccuracyEvaluation >(evaluation), "evaluation");
//////////////////////
/// RUN SIMULATION ///
//////////////////////
const lbm_generated::PerformanceEvaluation< FlagField_T > performance(blocks, ids.flagField, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, ids.flagField, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
WALBERLA_LOG_INFO_ON_ROOT("Starting Simulation")
WcTimingPool timeloopTiming;
WcTimer simTimer;
WALBERLA_MPI_BARRIER()
#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
simTimer.start();
timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
real_t time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
printResidentMemoryStatistics();
return EXIT_SUCCESS;
}
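// Parameter file for the TaylorGreenVortex app; main() reads it as the first
// command-line argument, e.g. ./TaylorGreenVortex TaylorGreenVortex.prm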
Parameters
{
omega 1.4;
timesteps 100;
maxLatticeVelocity 0.01;
}
DomainSetup
{
blocks < 4, 4, 4 >;
cellsPerBlock < 8, 8, 8 >;
periodic < 1, 1, 1 >;
numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
}
Boundaries
{
Border { direction S; walldistance -1; flag NoSlip; }
Border { direction N; walldistance -1; flag UBB; }
}
VTKWriter
{
vtkWriteFrequency 50;
velocity true;
density true;
averageFields false;
flag false;
writeOnlySlice false;
amrFileFormat false;
oneFilePerProcess false;
}
Logging
{
logLevel info; // info progress detail tracing
writeSetupForestAndReturn false;
remainingTimeLoggerFrequency 3; // in seconds
}
Evaluation
{
evaluationCheckFrequency 100;
logToStream true;
logToFile true;
filename Channel.txt;
}
import sympy as sp
from pystencils import Target
from pystencils import fields
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
import warnings
warnings.filterwarnings("ignore")
with CodeGeneration() as ctx:
target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU
data_type = "float64" if ctx.double_accuracy else "float32"
pdf_dtype = "float64"
streaming_pattern = 'pull'
timesteps = get_timesteps(streaming_pattern)
omega = sp.symbols("omega")
stencil = LBStencil(Stencil.D3Q27)
dim = stencil.D
pdfs, pdfs_tmp = fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
velocity_field, density_field = fields(f"velocity({dim}), density(1) : {data_type}[{dim}D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
streaming_pattern=streaming_pattern, compressible=True)
lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
method = create_lb_method(lbm_config=lbm_config)
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([sp.Symbol("u_x"), 0, 0], data_type=data_type))
generate_lbm_package(ctx, name="TaylorGreenVortex",
collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields, data_type=data_type)
generate_info_header(ctx, 'TaylorGreenVortexHeader')
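
On the C++ side, the generated package is then consumed through the info header. A hypothetical sketch follows; the <name><Suffix> class names assume the usual generate_lbm_package / generate_info_header naming convention and should be checked against the actually generated files:

// Hypothetical consumer sketch; class names assume the usual
// <name><Suffix> convention of generate_lbm_package / generate_info_header.
#include "field/FlagField.h"
#include "TaylorGreenVortexHeader.h"  // aggregated typedefs written by generate_info_header

using FlagField_T            = walberla::FlagField< walberla::uint8_t >;
using StorageSpecification_T = walberla::lbm::TaylorGreenVortexStorageSpecification;
using SweepCollection_T      = walberla::lbm::TaylorGreenVortexSweepCollection;
using BoundaryCollection_T   = walberla::lbm::TaylorGreenVortexBoundaryCollection< FlagField_T >;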
@@ -6,6 +6,11 @@ import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
try:
import machinestate as ms
except ImportError:
ms = None
def num_time_steps(block_size, time_steps_for_256_block=50):
# Number of time steps run for a workload of 256^3 cells per process
@@ -137,6 +142,14 @@ class Scenario:
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
        csv_file = "thermocapillary_benchmark.csv"
......
@@ -7,8 +7,10 @@ import sympy as sp
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils import Assignment, CreateKernelConfig, create_kernel, Field, FieldType, fields, Target
from pystencils.astnodes import LoopOverCoordinate
from pystencils.integer_functions import int_div
from pystencils.stencil import offset_to_direction_string
from pystencils.typing import TypedSymbol
from pystencils.typing import TypedSymbol, BasicType, PointerType, FieldPointerSymbol
from pystencils.stencil import inverse_direction
from pystencils.bit_masks import flag_cond
@@ -18,7 +20,7 @@ from lbmpy.enums import Stencil
from lbmpy.stencils import LBStencil
from pystencils_walberla.cmake_integration import CodeGenerationContext
from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode
from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode, AbortNode
from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
from pystencils_walberla.utility import config_from_context
@@ -101,6 +103,23 @@ class PackingKernelsCodegen:
self.accessors = [get_accessor(streaming_pattern, t) for t in get_timesteps(streaming_pattern)]
self.mask_field = fields(f'mask : uint32 [{self.dim}D]', layout=src_field.layout)
self.block_wise = True
        if not self.inplace or self.config.target != Target.GPU:
            self.block_wise = False
self.index = TypedSymbol("index", dtype=BasicType(np.int64))
self.index_shape = TypedSymbol("_size_0", dtype=BasicType(np.int64))
self.src_ptr_type = PointerType(self.src_field.dtype, const=True, restrict=True, double_pointer=True)
self.src_ptr = FieldPointerSymbol(self.src_field.name, self.src_field.dtype, const=True)
self.dst_ptr_type = PointerType(self.dst_field.dtype, const=False, restrict=True, double_pointer=True)
self.dst_ptr = FieldPointerSymbol(self.dst_field.name, self.dst_field.dtype, const=False)
self.data_src = TypedSymbol(f"_data_{self.src_field.name}_dp", dtype=self.src_ptr_type)
self.data_dst = TypedSymbol(f"_data_{self.dst_field.name}_dp", dtype=self.dst_ptr_type)
self.f = sp.IndexedBase(self.data_src, shape=self.index_shape)
self.d = sp.IndexedBase(self.data_dst, shape=self.index_shape)
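
The symbols above set up a block-wise calling convention: instead of one kernel launch per block, the generated kernel receives an array holding one base pointer per block of a level and selects its block via `index`. A plain C++ sketch of that access pattern (illustration only, not the literal generated code):

#include <cstdint>

// Sketch of the indirection set up above: _data_<field>_dp holds one base
// pointer per block on the current level, _size_0 is the number of blocks,
// and `index` picks the block (cf. Assignment(src_ptr, f[index]) below).
void blockWiseSweep(const double* const* dataSrcDp, double* const* dataDstDp, std::int64_t numBlocks)
{
    for (std::int64_t index = 0; index < numBlocks; ++index)
    {
        const double* src = dataSrcDp[index];  // pdfs base pointer of block `index`
        double* dst       = dataDstDp[index];  // pdfs_tmp base pointer of block `index`
        (void) src;
        (void) dst;  // per-cell stream/collide body omitted
    }
}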
def create_uniform_kernel_families(self, kernels_dict=None):
kernels = dict() if kernels_dict is None else kernels_dict
@@ -115,6 +134,8 @@ class PackingKernelsCodegen:
def create_nonuniform_kernel_families(self, kernels_dict=None):
kernels = dict() if kernels_dict is None else kernels_dict
kernels['localCopyRedistribute'] = self.get_local_copy_redistribute_kernel_family()
kernels['localPartialCoalescence'] = self.get_local_copy_partial_coalescence_kernel_family()
kernels['unpackRedistribute'] = self.get_unpack_redistribute_kernel_family()
kernels['packPartialCoalescence'] = self.get_pack_partial_coalescence_kernel_family()
kernels['zeroCoalescenceRegion'] = self.get_zero_coalescence_region_kernel_family()
@@ -231,7 +252,10 @@ class PackingKernelsCodegen:
dir_string = offset_to_direction_string(comm_dir)
streaming_dirs = self.get_streaming_dirs(comm_dir)
src, dst = self._stream_out_accs(timestep)
assignments = []
assignments = list()
if self.block_wise:
assignments.append(Assignment(self.src_ptr, self.f[self.index]))
assignments.append(Assignment(self.dst_ptr, self.d[self.index]))
dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
if len(dir_indices) == 0:
return None
@@ -283,15 +307,59 @@ class PackingKernelsCodegen:
return create_kernel(assignments, config=config)
def get_unpack_redistribute_kernel_family(self):
return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast)
return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast,
exclude_time_step=Timestep.EVEN)
def get_local_copy_redistribute_ast(self, comm_dir, timestep):
# TODO
raise NotImplementedError()
assert not all(d == 0 for d in comm_dir)
ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
dir_string = offset_to_direction_string(comm_dir)
streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir))
dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
if len(dir_indices) == 0:
return None
        # for inplace streaming the dst (fine grid) must always be in the odd state
dst_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
_, dst = self._stream_out_accs(dst_timestep)
src, _ = self._stream_out_accs(timestep)
src_abs = self.src_field.new_field_with_different_name(self.src_field.name)
src_abs.field_type = FieldType.CUSTOM
orthos = self.orthogonal_principals(comm_dir)
sub_dirs = self.contained_principals(comm_dir)
orthogonal_combinations = self.linear_combinations(orthos)
subdir_combinations = self.linear_combinations_nozero(sub_dirs)
second_gl_dirs = [o + s for o, s in product(orthogonal_combinations, subdir_combinations)]
negative_dir_correction = np.array([(1 if d == -1 else 0) for d in comm_dir])
assignments = []
for offset in orthogonal_combinations:
o = offset + negative_dir_correction
for d in range(self.values_per_cell):
field_acc = dst[d].get_shifted(*o)
src_access = [int_div(ctr[i], 2) + o for i, o in enumerate(src[d].offsets)]
assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
for offset in second_gl_dirs:
o = offset + negative_dir_correction
for d in dir_indices:
field_acc = dst[d].get_shifted(*o)
src_access = [int_div(ctr[i], 2) + o for i, o in enumerate(src[d].offsets)]
assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
function_name = f'localCopyRedistribute_{dir_string}' + timestep_suffix(timestep)
iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
config = CreateKernelConfig(function_name=function_name, iteration_slice=iteration_slice,
data_type=self.data_type, ghost_layers=0, allow_double_writes=True,
cpu_openmp=self.config.cpu_openmp, target=self.config.target)
return create_kernel(assignments, config=config)
def get_local_copy_redistribute_kernel_family(self):
# TODO
raise NotImplementedError()
return self._construct_directionwise_kernel_family(self.get_local_copy_redistribute_ast)
# --------------------------- Pack / Unpack / LocalCopy Fine to Coarse ---------------------------------------------
@@ -322,7 +390,8 @@ class PackingKernelsCodegen:
return ast
def get_pack_partial_coalescence_kernel_family(self):
return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast)
return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast,
exclude_time_step=Timestep.ODD)
def get_unpack_coalescence_ast(self, comm_dir, timestep):
config = replace(self.config, ghost_layers=0)
@@ -370,12 +439,53 @@ class PackingKernelsCodegen:
def get_zero_coalescence_region_kernel_family(self):
return self._construct_directionwise_kernel_family(self.get_zero_coalescence_region_ast)
# TODO
def get_local_copy_partial_coalescence_ast(self, comm_dir, timestep):
raise NotImplementedError()
assert not all(d == 0 for d in comm_dir)
ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
dir_string = offset_to_direction_string(comm_dir)
streaming_dirs = self.get_streaming_dirs(comm_dir)
dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
if len(dir_indices) == 0:
return None
buffer = sp.symbols(f"b_:{self.values_per_cell}")
        # for inplace streaming the src (fine grid) must always be in the even state
src_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
src, _ = self._stream_in_accs(src_timestep)
_, dst = self._stream_in_accs(timestep.next())
mask = self.mask_field
dst_abs = self.dst_field.new_field_with_different_name(self.dst_field.name)
dst_abs.field_type = FieldType.CUSTOM
coalescence_factor = sp.Rational(1, 2 ** self.dim)
offsets = list(product(*((0, 1) for _ in comm_dir)))
assignments = []
for i, d in enumerate(dir_indices):
acc = 0
for o in offsets:
acc += flag_cond(d, mask[o], src[d].get_shifted(*o))
assignments.append(Assignment(buffer[i], acc))
for i, d in enumerate(dir_indices):
index = dst[d].index
dst_access = [int_div(ctr[i], 2) + o for i, o in enumerate(dst[d].offsets)]
assignments.append(Assignment(dst_abs.absolute_access(dst_access, index),
dst_abs.absolute_access(dst_access, index) + coalescence_factor * buffer[i]))
iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
config = replace(self.config, iteration_slice=iteration_slice, ghost_layers=0)
ast = create_kernel(assignments, config=config)
ast.function_name = f'localPartialCoalescence_{dir_string}' + timestep_suffix(timestep)
return ast
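
Schematically, the assignments above implement the partial coalescence as a masked average over the 2^d fine-grid children of each coarse cell x_c (d = self.dim, hence coalescence_factor = 1/2^d):

    b_q = \sum_{s \in \{0,1\}^d} \chi_q(2 x_c + s) \, f_q^{\mathrm{fine}}(2 x_c + s), \qquad
    f_q^{\mathrm{coarse}}(x_c) \leftarrow f_q^{\mathrm{coarse}}(x_c) + 2^{-d} \, b_q

where q runs over dir_indices and \chi_q is the per-cell flag read through flag_cond from the mask field.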
def get_local_copy_partial_coalescence_kernel_family(self):
raise NotImplementedError()
return self._construct_directionwise_kernel_family(self.get_local_copy_partial_coalescence_ast)
# ------------------------------------------ Utility ---------------------------------------------------------------
@@ -425,7 +535,7 @@ class PackingKernelsCodegen:
# --------------------------- Private Members ----------------------------------------------------------------------
def _construct_directionwise_kernel_family(self, create_ast_callback):
def _construct_directionwise_kernel_family(self, create_ast_callback, exclude_time_step=None):
subtrees = []
direction_symbol = TypedSymbol('dir', dtype='stencil::Direction')
for t in get_timesteps(self.streaming_pattern):
@@ -439,7 +549,10 @@ class PackingKernelsCodegen:
continue
kernel_call = KernelCallNode(ast)
cases_dict[f"stencil::{dir_string}"] = kernel_call
subtrees.append(SwitchNode(direction_symbol, cases_dict))
if exclude_time_step is not None and t == exclude_time_step:
                subtrees.append(AbortNode("This function cannot be called! Please contact the waLBerla team"))
else:
subtrees.append(SwitchNode(direction_symbol, cases_dict))
if not self.inplace:
tree = subtrees[0]
......
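
With exclude_time_step set, the selection subtree for that time step collapses to an abort. Assuming AbortNode expands to a WALBERLA_ABORT call, the generated dispatch looks roughly like the following sketch for exclude_time_step == Timestep.ODD (function name, signature, and kernel call are illustrative only):

#include "core/Abort.h"
#include "stencil/Directions.h"

// Rough shape of the generated selection tree (sketch, not generated output):
// even time steps dispatch over the communication direction, the excluded
// odd time step aborts immediately.
void packPartialCoalescence(walberla::stencil::Direction dir, unsigned char timestep)
{
    if ((timestep & 1) == 0)
    {
        switch (dir)
        {
        case walberla::stencil::N:
            // kernel_packPartialCoalescence_N(...);  // hypothetical kernel call
            break;
        default:
            break;
        }
    }
    else
    {
        WALBERLA_ABORT("This function cannot be called! Please contact the waLBerla team")
    }
}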
@@ -113,8 +113,8 @@ def generate_lbm_storage_specification(generation_context: CodeGenerationContext
'kernels': kernels,
'direction_sizes': cg.get_direction_sizes(),
'src_field': cg.src_field,
'dst_field': cg.dst_field
'dst_field': cg.dst_field,
'block_wise': cg.block_wise
}
if nonuniform:
jinja_context['mask_field'] = cg.mask_field
......
from dataclasses import replace
from typing import Dict
from jinja2 import Environment, PackageLoader, StrictUndefined
import sympy as sp
import numpy as np
from pystencils import Target, create_kernel
from pystencils import Target, create_kernel, Assignment
from pystencils.bit_masks import flag_cond
from pystencils.config import CreateKernelConfig
from pystencils.field import Field
from pystencils.field import Field, fields
from pystencils.simp import add_subexpressions_for_field_reads
from pystencils.typing import BasicType, PointerType, FieldPointerSymbol, TypedSymbol, CastFunc
from lbmpy.advanced_streaming import is_inplace, get_accessor, Timestep
from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation
@@ -17,8 +21,8 @@ from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily
from pystencils_walberla.utility import config_from_context
from pystencils_walberla import generate_sweep_collection
from lbmpy_walberla.utility import create_pdf_field
from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
from lbmpy_walberla.utility import create_pdf_field, timestep_suffix
from .alternating_sweeps import EvenIntegerCondition
from .function_generator import kernel_family_function_generator
@@ -28,7 +32,7 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation,
refinement_scaling=None, macroscopic_fields: Dict[str, Field] = None,
target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None,
max_threads=None,
max_threads=None, set_pre_collision_pdfs=True,
**create_kernel_params):
config = config_from_context(ctx, target=target, data_type=data_type,
@@ -59,7 +63,6 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
field_layout=lbm_optimisation.field_layout)
config = replace(config, ghost_layers=0)
function_generators = []
def family(name):
@@ -68,6 +71,21 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
def generator(name, kernel_family):
return kernel_family_function_generator(name, kernel_family, namespace='lbm', max_threads=max_threads)
all_fields = collision_rule.bound_fields.union(collision_rule.free_fields)
all_fields.update({src_field, dst_field})
all_fields = list(sorted(all_fields, key=lambda e: str(e)))
swap_pairs = [(x, y) for x in all_fields for y in all_fields if y.name == f"{x.name}_tmp"]
for swap_pair in swap_pairs:
all_fields.remove(swap_pair[1])
bw_stream_collide = None
bw_stream = None
if is_inplace(streaming_pattern):
bw_stream_collide = block_wise_stream_collide(class_name, collision_rule, lbm_config,
src_field, dst_field, config)
bw_stream = block_wise_stream(class_name, collision_rule, lbm_config, src_field, dst_field, config)
function_generators.append(generator('streamCollide', family("streamCollide")))
function_generators.append(generator('collide', family("collide")))
function_generators.append(generator('stream', family("stream")))
@@ -76,7 +94,7 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
config_unoptimized = replace(config, cpu_vectorize_info=None, cpu_prepend_optimizations=[], cpu_blocking=None)
setter_family = get_setter_family(class_name, lb_method, src_field, streaming_pattern, macroscopic_fields,
config_unoptimized)
config_unoptimized, set_pre_collision_pdfs)
setter_generator = kernel_family_function_generator('initialise', setter_family,
namespace='lbm', max_threads=max_threads)
function_generators.append(setter_generator)
@@ -87,7 +105,66 @@ def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmColli
namespace='lbm', max_threads=max_threads)
function_generators.append(getter_generator)
generate_sweep_collection(ctx, class_name, function_generators, refinement_scaling)
contexts_function_generators = list()
for fct in function_generators:
contexts_function_generators.append(fct())
namespaces = set([context['namespace'] for context in contexts_function_generators])
assert len(namespaces) == 1, "All function_generators must output the same namespace!"
namespace = namespaces.pop()
headers = set()
for context in contexts_function_generators:
for header in context['interface_spec'].headers:
headers.add(header)
for header in context['kernel'].get_headers():
headers.add(header)
kernel_list = list()
for context in contexts_function_generators:
kernel_list.append(context['kernel'])
kernels = list()
for context in contexts_function_generators:
kernels.append({
'kernel': context['kernel'],
'function_name': context['function_name'],
'ghost_layers_to_include': 'ghost_layers',
'field': context['field'],
'max_threads': context['max_threads']
})
target = kernels[0]['kernel'].target
jinja_context = {
'block_stream_collide': bw_stream_collide,
'block_stream': bw_stream,
'pdf_field': src_field,
'all_fields': all_fields,
'swap_pairs': swap_pairs,
'kernel_list': kernel_list,
'kernels': kernels,
'namespace': namespace,
'class_name': class_name,
'headers': headers,
'target': target.name.lower(),
'is_gpu': target == Target.GPU,
'parameter_scaling': refinement_scaling,
'stencil_name': lbm_config.stencil.name,
'D': lbm_config.stencil.D,
'Q': lbm_config.stencil.Q,
'inplace': is_inplace(lbm_config.streaming_pattern)
}
env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined)
add_pystencils_filters_to_jinja_env(env)
header = env.get_template("LBMSweepCollection.tmpl.h").render(**jinja_context)
source = env.get_template("LBMSweepCollection.tmpl.cpp").render(**jinja_context)
source_extension = "cu" if target == Target.GPU and ctx.cuda else "cpp"
ctx.write_file(f"{class_name}.h", header)
ctx.write_file(f"{class_name}.{source_extension}", source)
class RefinementScaling:
@@ -167,14 +244,15 @@ def lbm_kernel_family(class_name, kernel_name,
return family
def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields, config: CreateKernelConfig):
def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopic_fields,
config: CreateKernelConfig, set_pre_collision_pdfs):
dim = lb_method.stencil.D
density = macroscopic_fields.get('density', 1.0)
velocity = macroscopic_fields.get('velocity', [0.0] * dim)
default_dtype = config.data_type.default_factory()
get_timestep = {"field_name": pdfs.name, "function": "getTimestep"}
get_timestep = {"field_name": pdfs.name, "function": "getTimestepPlusOne"}
temporary_fields = ()
field_swaps = ()
@@ -184,7 +262,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
timestep_suffix = str(timestep)
setter = macroscopic_values_setter(lb_method,
density=density, velocity=velocity, pdfs=pdfs,
streaming_pattern=streaming_pattern, previous_timestep=timestep)
streaming_pattern=streaming_pattern, previous_timestep=timestep,
set_pre_collision_pdfs=set_pre_collision_pdfs)
if default_dtype != pdfs.dtype:
setter = add_subexpressions_for_field_reads(setter, data_type=default_dtype)
@@ -198,7 +277,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
timestep = Timestep.BOTH
setter = macroscopic_values_setter(lb_method,
density=density, velocity=velocity, pdfs=pdfs,
streaming_pattern=streaming_pattern, previous_timestep=timestep)
streaming_pattern=streaming_pattern, previous_timestep=timestep,
set_pre_collision_pdfs=set_pre_collision_pdfs)
setter_ast = create_kernel(setter, config=config)
setter_ast.function_name = 'kernel_initialise'
@@ -249,3 +329,127 @@ def get_getter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
family = KernelFamily(node, class_name, temporary_fields=temporary_fields, field_swaps=field_swaps)
return family
def block_wise_stream_collide(class_name, collision_rule, lbm_config, src_field, dst_field, config):
if not is_inplace(lbm_config.streaming_pattern):
ast, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.BOTH, config, False)
tree = KernelCallNode(ast)
else:
ast_even, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.EVEN, config, False)
even_call = KernelCallNode(ast_even)
ast_odd, _ = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.ODD, config, False)
odd_call = KernelCallNode(ast_odd)
tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8)
family = KernelFamily(tree, class_name)
indexed_to_field_name = dict()
for field in all_fields:
indexed_to_field_name[field.name] = f"_data_{field.name}_dp"
context = {
'kernel': family,
'all_fields': all_fields,
'namespace': 'lbm',
'function_name': 'blockStreamCollide',
'indexed_to_field_name': indexed_to_field_name,
'max_threads': None
}
return context
def block_wise_stream(class_name, collision_rule, lbm_config, src_field, dst_field, config):
if not is_inplace(lbm_config.streaming_pattern):
ast, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.BOTH, config, True)
tree = KernelCallNode(ast)
else:
ast_even, all_fields = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.EVEN, config, True)
even_call = KernelCallNode(ast_even)
ast_odd, _ = create_block_wise_ast(collision_rule, src_field, dst_field,
lbm_config, Timestep.ODD, config, True)
odd_call = KernelCallNode(ast_odd)
tree = EvenIntegerCondition('timestep', even_call, odd_call, parameter_dtype=np.uint8)
family = KernelFamily(tree, class_name)
indexed_to_field_name = dict()
for field in all_fields:
indexed_to_field_name[field.name] = f"_data_{field.name}_dp"
context = {
'kernel': family,
'all_fields': all_fields,
'namespace': 'lbm',
'function_name': 'blockStream',
'indexed_to_field_name': indexed_to_field_name,
'max_threads': None
}
return context
def create_block_wise_ast(collision_rule, src_field, dst_field, lbm_config, timestep, config, stream_only):
stencil = lbm_config.stencil
streaming_pattern = lbm_config.streaming_pattern
default_dtype = config.data_type.default_factory()
config = replace(config, gpu_indexing_params={})
accessor = get_accessor(streaming_pattern, timestep)
if stream_only:
update_rule = create_stream_only_kernel(stencil, src_field, dst_field, accessor)
else:
update_rule = create_lbm_kernel(collision_rule, src_field, dst_field, accessor, data_type=default_dtype)
bound_fields = update_rule.bound_fields
free_fields = update_rule.free_fields
all_fields = list(bound_fields.union(free_fields))
all_fields.sort(key=lambda field: field.name)
index = TypedSymbol("index", dtype=BasicType(np.int64))
index_shape = TypedSymbol("_size_0", dtype=BasicType(np.int64))
ass = list()
for field in all_fields:
const = True if field in free_fields else False
ptr_type = PointerType(field.dtype, const=const, restrict=True, double_pointer=True)
ptr = FieldPointerSymbol(field.name, field.dtype, const=const)
f = sp.IndexedBase(TypedSymbol(f"_data_{field.name}_dp", dtype=ptr_type), shape=index_shape)
ass.append(Assignment(ptr, f[index]))
update_rule = ass + update_rule.all_assignments
ast = create_kernel(update_rule, config=config)
base_name = "kernel_BlockStream" if stream_only else "kernel_BlockStreamCollide"
ast.function_name = base_name + timestep_suffix(timestep)
ast.assumed_inner_stride_one = config.cpu_vectorize_info['assume_inner_stride_one']
return ast, all_fields
# def subexpressions_for_field_lbm_writes(ac, lattice_stencil, src_field):
# field_writes = set()
#
# for assignment in ac.all_assignments:
# if hasattr(assignment, 'lhs') and hasattr(assignment, 'rhs'):
# field_writes.update(assignment.lhs.atoms(Field.Access))
#
# field_writes = {f for f in field_writes if f.field.name == src_field.name}
# field_writes = list(field_writes)
# lbm_reads.sort(key=lambda field_read: lattice_stencil.index(field_read.offsets))
#
# substitutions = {fa: sp.Symbol(f"p_{i}") for i, fa in enumerate(lbm_reads)}
# for i, fa in enumerate(other_reads):
# substitutions[fa] = sp.Symbol(f"aleph_{i}")
# return (ac.new_with_substitutions(substitutions, add_substitutions_as_subexpressions=True,
# substitute_on_lhs=False, sort_topologically=False), lbm_reads)
@@ -41,12 +41,12 @@ class {{class_name}}
enum Type { ALL = 0, INNER = 1, OUTER = 2 };
{{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'pdfs'])], additional_constructor_arguments] | type_identifier_list -}} )
{{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'forceVector', 'forceVectorSize', 'pdfs'])], additional_constructor_arguments] | type_identifier_list -}} )
: blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_)
{
{% for object_name, boundary_class, kernel, additional_data_handler in zip(object_names, boundary_classes, kernel_list, additional_data_handlers) -%}
{{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'], use_field_ids=True)], additional_data_handler.constructor_argument_name] | type_identifier_list -}});
{{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'forceVector', 'forceVectorSize', 'pdfs', 'timestep', 'gpuStream'], use_field_ids=True)], additional_data_handler.constructor_argument_name] | type_identifier_list -}});
{% endfor %}
{% for object_name, flag_uid in zip(object_names, flag_uids) -%}
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file {{class_name}}.cpp
//! \\author pystencils
//======================================================================================================================
#include "{{class_name}}.h"
{% if target is equalto 'cpu' -%}
#define FUNC_PREFIX
{%- elif target is equalto 'gpu' -%}
#define FUNC_PREFIX __global__
{%- endif %}
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wignored-qualifiers"
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
#pragma warning push
#pragma warning( disable : 1599 )
#endif
#ifdef __CUDACC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 191
#else
#pragma diag_suppress 191
#endif
#endif
using namespace std;
namespace walberla {
namespace {{namespace}} {
{%if block_stream_collide -%}
{{block_stream_collide['kernel']|generate_definitions(target, block_stream_collide['max_threads'])}}
{{block_stream['kernel']|generate_definitions(target, block_stream['max_threads'])}}
{%- endif %}
{% for kernel in kernels %}
{{kernel['kernel']|generate_definitions(target, kernel['max_threads'])}}
{% endfor %}
{%if block_stream_collide -%}
void {{class_name}}::{{block_stream_collide['function_name']}}({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}})
{
{%if target is equalto 'gpu' -%}
dim3 _grid = grid_[level];
dim3 _block = block_[level];
{%- for field in block_stream_collide['all_fields'] %}
{{field.dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][field.name]}} = {{field.name}}PointersGPU[level];
{%- endfor %}
{%- for swap_pair in swap_pairs %}
{{swap_pair[1].dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][swap_pair[1].name]}} = {{swap_pair[1].name}}PointersGPU[level];
{%- endfor %}
{% else %}
{%- for field in block_stream_collide['all_fields'] %}
{{field.dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][field.name]}} = {{field.name}}Pointers[level].data();
{%- endfor %}
{%- for swap_pair in swap_pairs %}
{{swap_pair[1].dtype.c_name}} ** {{block_stream_collide['indexed_to_field_name'][swap_pair[1].name]}} = {{swap_pair[1].name}}Pointers[level].data();
{%- endfor %}
{%- endif %}
const int64_t _size_0 = size_0[level];
int64_t _size_{{block_stream_collide['all_fields'][0].name}}_0 = size_1;
int64_t _size_{{block_stream_collide['all_fields'][0].name}}_1 = size_2;
int64_t _size_{{block_stream_collide['all_fields'][0].name}}_2 = size_3;
{{block_stream_collide['kernel']|generate_field_strides()|indent(3)}}
{{block_stream_collide['kernel']|generate_refs_for_kernel_parameters(prefix="this->", parameters_to_ignore=["_size_0"], ignore_fields=True, parameter_registration=parameter_scaling, level_known=True)|indent(3)}}
{{block_stream_collide['kernel']|generate_call(stream='stream', plain_kernel_call=True)|indent(3)}}
{%- for swap_pair in swap_pairs %}
{%if target is equalto 'gpu' -%}
std::swap({{swap_pair[0].name}}PointersGPU[level], {{swap_pair[1].name}}PointersGPU[level]);
{% else %}
std::swap({{swap_pair[0].name}}Pointers[level], {{swap_pair[1].name}}Pointers[level]);
{%- endif %}
{%- endfor %}
}
void {{class_name}}::ghostLayerPropagation({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{block_stream['kernel']|generate_field_strides()|indent(3)}}
{%if target is equalto 'gpu' -%}
auto parallelSection_ = parallelStreams_.parallelSection( stream );
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
int64_t _size_0 = int64_c(it->second.size());
int64_t _size_{{pdf_field.name}}_0 = std::get<0>(it->first);
int64_t _size_{{pdf_field.name}}_1 = std::get<1>(it->first);
int64_t _size_{{pdf_field.name}}_2 = std::get<2>(it->first);
double ** _data_{{pdf_field.name}}_dp = glPropagationPDFsGPU[level][it->first];
dim3 _grid = glPropagationGrid_[level][it->first];
dim3 _block = glPropagationBlock_[level][it->first];
parallelSection_.run([&]( auto s ) {
{{block_stream['kernel']|generate_call(stream='s', plain_kernel_call=True)|indent(9)}}
});
}
{% else %}
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
int64_t _size_0 = int64_c(it->second.size());
int64_t _size_{{pdf_field.name}}_0 = std::get<0>(it->first);
int64_t _size_{{pdf_field.name}}_1 = std::get<1>(it->first);
int64_t _size_{{pdf_field.name}}_2 = std::get<2>(it->first);
double ** _data_{{pdf_field.name}}_dp = it->second.data();
{{block_stream['kernel']|generate_call(stream='s', plain_kernel_call=True)|indent(6)}}
}
{%- endif %}
}
{%- endif %}
{% for kernel in kernels %}
void {{class_name}}::{{kernel['function_name']}}( {{kernel['kernel']|generate_plain_parameter_list(ghost_layers=True)}} )
{
{{kernel['kernel']|generate_call(ghost_layers_to_include=kernel['ghost_layers_to_include'], stream='stream')|indent(3)}}
}
void {{class_name}}::{{kernel['function_name']}}CellInterval( {{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci')}})
{
{{kernel['kernel']|generate_call(stream='stream', cell_interval='ci')|indent(3)}}
}
{% endfor %}
} // namespace {{namespace}}
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
#pragma warning pop
#endif
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \\file {{class_name}}.h
//! \\author pystencils
//======================================================================================================================
#pragma once
#include "blockforest/StructuredBlockForest.h"
#include "blockforest/BlockID.h"
#include "blockforest/Block.h"
#include "core/DataTypes.h"
#include "core/logging/Logging.h"
#include "core/Macros.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/iterators/FieldIterator.h"
{% if target is equalto 'gpu' -%}
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/GPUField.h"
#include "gpu/ParallelStreams.h"
{%- endif %}
#include "domain_decomposition/BlockDataID.h"
#include "domain_decomposition/IBlock.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include "field/SwapableCompare.h"
#include "field/GhostLayerField.h"
#include "stencil/Directions.h"
#include "stencil/{{stencil_name}}.h"
#include <set>
#include <cmath>
{% for header in headers %}
#include {{header}}
{% endfor %}
using namespace std::placeholders;
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-parameter"
# pragma GCC diagnostic ignored "-Wreorder"
#endif
namespace walberla {
namespace {{namespace}} {
class {{class_name}}
{
public:
enum Type { ALL = 0, INNER = 1, OUTER = 2 };
using sizeTuple = std::tuple<int64_t, int64_t, int64_t>;
{{class_name}}(const shared_ptr< StructuredBlockForest > & blocks, {{kernel_list|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1))
: blocks_(blocks), {{ kernel_list|generate_constructor_initializer_list(parameter_registration=parameter_scaling) }}, outerWidth_(outerWidth)
{
{{kernel_list|generate_constructor(parameter_registration=parameter_scaling) |indent(6)}}
validInnerOuterSplit_ = true;
for (auto& iBlock : *blocks)
{
if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 ||
int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 ||
int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2)
validInnerOuterSplit_ = false;
}
}
void initialiseBlockPointer()
{
{%if block_stream_collide -%}
size_0.resize(blocks_->getNumberOfLevels());
{%- for field in block_stream_collide['all_fields'] %}
{{field.name}}Pointers.resize(blocks_->getNumberOfLevels());
{%if target is equalto 'gpu' -%} {{field.name}}PointersGPU.resize(blocks_->getNumberOfLevels()); {% endif %}
{%- endfor %}
{%if target is equalto 'gpu' -%} block_.resize(blocks_->getNumberOfLevels()); {% endif %}
{%if target is equalto 'gpu' -%} grid_.resize(blocks_->getNumberOfLevels()); {% endif %}
glPropagationPDFs.resize(blocks_->getNumberOfLevels());
{%if target is equalto 'gpu' -%} glPropagationPDFsGPU.resize(blocks_->getNumberOfLevels()); {% endif %}
{%if target is equalto 'gpu' -%} glPropagationBlock_.resize(blocks_->getNumberOfLevels()); {% endif %}
{%if target is equalto 'gpu' -%} glPropagationGrid_.resize(blocks_->getNumberOfLevels()); {% endif %}
for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
{%- for field in block_stream_collide['all_fields'] %}
auto {{field.name}} = local->getData< {{field | field_type(is_gpu=is_gpu)}} >({{field.name}}ID);
{%- endfor %}
size_1 = int64_c({{block_stream_collide['all_fields'][0].name}}->xSize());
size_2 = int64_c({{block_stream_collide['all_fields'][0].name}}->ySize());
size_3 = int64_c({{block_stream_collide['all_fields'][0].name}}->zSize());
{%- for field in block_stream_collide['all_fields'] %}
stride_{{field.name}}_0 = int64_c({{field.name}}->xStride());
stride_{{field.name}}_1 = int64_c({{field.name}}->yStride());
stride_{{field.name}}_2 = int64_c({{field.name}}->zStride());
stride_{{field.name}}_3 = int64_c(1 * int64_c({{field.name}}->fStride()));
{%- endfor %}
break;
}
for( auto it = blocks_->begin(); it != blocks_->end(); ++it )
{
auto* local = dynamic_cast< Block* >(it.get());
const uint_t level = local->getLevel();
{%- for field in block_stream_collide['all_fields'] %}
auto {{field.name}} = local->getData< {{field | field_type(is_gpu=is_gpu)}} >({{field.name}}ID);
{{field.name}}Pointers[level].emplace_back({{field.name}}->dataAt(0, 0, 0, 0));
{%- endfor %}
for(auto dir = stencil::{{stencil_name}}::beginNoCenter(); dir != stencil::{{stencil_name}}::end(); ++dir){
uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
// Propagate on ghost layers shadowing coarse or no blocks
if(local->neighborhoodSectionHasLargerBlock(nSecIdx)){
CellInterval ci;
{{pdf_field.name}}->getGhostRegion(*dir, ci, 1);
sizeTuple dirTuple = std::make_tuple(int64_c(ci.xSize()), int64_c(ci.ySize()), int64_c(ci.zSize()));
glPropagationPDFs[level][dirTuple].push_back({{pdf_field.name}}->dataAt(ci.xMin(), ci.yMin(), ci.zMin(), 0));
}
}
}
for (uint_t level = 0; level < blocks_->getNumberOfLevels(); level++) {
size_0[level] = int64_c({{pdf_field.name}}Pointers[level].size());
{%if target is equalto 'gpu' -%}
int64_t indexingX = size_1 * size_0[level];
int64_t indexingY = size_2;
int64_t indexingZ = size_3;
int64_t cudaBlockSize0 = 128;
int64_t cudaBlockSize1 = 1;
int64_t cudaBlockSize2 = 1;
        // Launch configuration: clamp the per-level extents to the generated
        // CUDA block (cudaBlockSize0/1/2) and the hardware limits (1024, 1024, 64),
        // then derive the grid as the ceiling division of extent by block size.
        const int64_t bx = indexingX < cudaBlockSize0 ? indexingX : cudaBlockSize0;
        const int64_t byCap = cudaBlockSize1 * (cudaBlockSize0 / bx);
        const int64_t by = indexingY < byCap ? indexingY : byCap;
        const int64_t bzCap = cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (bx * by));
        const int64_t bz = indexingZ < bzCap ? indexingZ : bzCap;
        const int64_t blockX = bx < 1024 ? bx : 1024;
        const int64_t blockY = by < 1024 ? by : 1024;
        const int64_t blockZ = bz < 64 ? bz : 64;
        block_[level] = dim3((unsigned int) blockX, (unsigned int) blockY, (unsigned int) blockZ);
        grid_[level]  = dim3((unsigned int) ((indexingX + blockX - 1) / blockX),
                             (unsigned int) ((indexingY + blockY - 1) / blockY),
                             (unsigned int) ((indexingZ + blockZ - 1) / blockZ));
{%- for field in block_stream_collide['all_fields'] %}
WALBERLA_GPU_CHECK(gpuMalloc( (void**)&{{field.name}}PointersGPU[level], sizeof(double* ) * {{field.name}}Pointers[level].size() ));
WALBERLA_GPU_CHECK(gpuMemcpy( {{field.name}}PointersGPU[level], &{{field.name}}Pointers[level][0], sizeof(double *) * {{field.name}}Pointers[level].size(), gpuMemcpyHostToDevice ));
{%- endfor %}
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
indexingX = std::get<0>(it->first) * int64_c(it->second.size());
indexingY = std::get<1>(it->first);
indexingZ = std::get<2>(it->first);
cudaBlockSize0 = 32;
cudaBlockSize1 = 1;
cudaBlockSize2 = 1;
glPropagationBlock_[level][it->first] = dim3((unsigned int)((1024 < ((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)) ? 1024 : ((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)), (unsigned int)((1024 < ((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))) ? 1024 : ((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))), (unsigned int)((64 < ((indexingZ < cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))))) ? indexingZ : cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))))))) ? 64 : ((indexingZ < cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0))))))) ? indexingZ : cudaBlockSize2*((int64_t)(cudaBlockSize0*cudaBlockSize1) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)*((indexingY < cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))) ? indexingY : cudaBlockSize1*((int64_t)(cudaBlockSize0) / (int64_t)(((indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0)))))))));
{
const int64_t maxBlockX = (indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0;
const int64_t maxBlockY = (indexingY < cudaBlockSize1 * (cudaBlockSize0 / maxBlockX)) ? indexingY : cudaBlockSize1 * (cudaBlockSize0 / maxBlockX);
const int64_t maxBlockZ = (indexingZ < cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY))) ? indexingZ : cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY));
const int64_t blockX = (maxBlockX < 1024) ? maxBlockX : 1024;
const int64_t blockY = (maxBlockY < 1024) ? maxBlockY : 1024;
const int64_t blockZ = (maxBlockZ < 64) ? maxBlockZ : 64;
glPropagationGrid_[level][it->first] = dim3((unsigned int)(indexingX % blockX == 0 ? indexingX / blockX : indexingX / blockX + 1),
(unsigned int)(indexingY % blockY == 0 ? indexingY / blockY : indexingY / blockY + 1),
(unsigned int)(indexingZ % blockZ == 0 ? indexingZ / blockZ : indexingZ / blockZ + 1));
}
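// Worked example for the launch-grid computation above: with indexingX = 200 fused
// cells and cudaBlockSize0 = 128, blockX = min(1024, min(200, 128)) = 128 and the
// grid gets ceil(200 / 128) = 2 thread blocks in x.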
WALBERLA_GPU_CHECK(gpuMalloc( (void**)&glPropagationPDFsGPU[level][it->first], sizeof(double* ) * it->second.size() ));
WALBERLA_GPU_CHECK(gpuMemcpy( glPropagationPDFsGPU[level][it->first], &it->second[0], sizeof(double* ) * it->second.size(), gpuMemcpyHostToDevice ));
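// The host-side table of per-block PDF pointers is mirrored to the device here, so a
// single fused kernel launch can later propagate the ghost layers of all equally
// sized regions at once.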
}
{% endif %}
}
{% endif %}
};
~{{class_name}}() {
{%if target is equalto 'gpu' and block_stream_collide -%}
for (uint_t level = 0; level < blocks_->getNumberOfLevels(); level++)
{
{%- for field in block_stream_collide['all_fields'] %}
if(!{{field.name}}Pointers[level].empty()){
WALBERLA_GPU_CHECK(gpuFree({{field.name}}PointersGPU[level]));
}
{%- endfor %}
for (auto it = glPropagationPDFs[level].begin(); it != glPropagationPDFs[level].end(); it++){
if(it->second.empty()){ continue;}
WALBERLA_GPU_CHECK(gpuFree(glPropagationPDFsGPU[level][it->first]));
}
}
{%- endif %}
}
/*************************************************************************************
* Internal function definitions with raw pointers
*************************************************************************************/
{%if block_stream_collide -%}
void blockStreamCollide({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}});
void ghostLayerPropagation({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}});
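// Reading of the interface (inferred from the definitions in the source template):
// blockStreamCollide fuses the stream-and-collide sweep over all blocks of one
// refinement level into a single call; ghostLayerPropagation afterwards moves the
// PDFs that crossed block-local ghost layers.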
{%- endif %}
{%- for kernel in kernels %}
static void {{kernel['function_name']}} ({{kernel['kernel']|generate_plain_parameter_list(ghost_layers=0, stream="nullptr")}});
static void {{kernel['function_name']}}CellInterval ({{kernel['kernel']|generate_plain_parameter_list(cell_interval='ci', stream="nullptr")}});
{% endfor %}
/*************************************************************************************
* Function definitions for external usage
*************************************************************************************/
std::function<void ()> blockStreamCollideFck({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
{
{%if block_stream_collide -%}
return [{{- ["this", "level", "timestep", ["stream"] if target == 'gpu' else []] | type_identifier_list -}}](){
blockStreamCollide({{- ["level", "timestep", ["stream"] if target == 'gpu' else []] | type_identifier_list -}});
};
{% else %}
WALBERLA_ABORT("Block-wise stream collide is only generated when in-place streaming patterns are used")
{%- endif %}
}
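// Usage sketch (timeloop registration is hypothetical, not part of the generated
// interface): the returned functor binds level and timestep once, so a timeloop only
// sees a plain void() callable, e.g.
//   timeloop.addFuncBeforeTimeStep(sweeps.blockStreamCollideFck(level, timestep), "LBM");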
void streamCollideOverBlocks({{- ["uint_t level", "uint8_t timestep", ["gpuStream_t stream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}} )
{
{%if block_stream_collide -%}
blockStreamCollide({{- ["level", "timestep", ["stream"] if target == 'gpu' else []] | type_identifier_list -}});
{% else %}
WALBERLA_ABORT("Block-wise stream collide is only generated when in-place streaming patterns are used")
{%- endif %}
}
{%- for kernel in kernels %}
std::function<void (IBlock *)> {{kernel['function_name']}}()
{
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); };
}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", ] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
default:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); };
}
}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers"] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
default:
return [{{- ["this", "ghost_layers"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers"] | type_identifier_list -}}); };
}
}
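// The Type::INNER / Type::OUTER variants enable communication hiding: the block
// interior is swept while ghost-layer communication is still in flight, and the
// outer shell is swept afterwards; Type::ALL falls back to the full-block kernel.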
{% if target is equalto 'gpu' -%}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
default:
return [{{- ["this", "ghost_layers", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers", "gpuStream"] | type_identifier_list -}}); };
}
}
std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "gpuStream_t gpuStream"] | type_identifier_list -}})
{
if (!validInnerOuterSplit_ && type != Type::ALL)
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
switch (type)
{
case Type::INNER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
case Type::OUTER:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
default:
return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "cell_idx_c(0)", "gpuStream"] | type_identifier_list -}}); };
}
}
{%- endif %}
void {{kernel['function_name']}}({{- ["IBlock * block",] | type_identifier_list -}})
{
const cell_idx_t ghost_layers = 0;
{% if target is equalto 'gpu' -%}
gpuStream_t gpuStream = nullptr;
{%- endif %}
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers"] | type_identifier_list -}})
{
{% if target is equalto 'gpu' -%}
gpuStream_t gpuStream = nullptr;
{%- endif %}
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
{% if target is equalto 'gpu' -%}
void {{kernel['function_name']}}({{- ["IBlock * block", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}({{kernel['kernel']|generate_function_collection_call(ghost_layers='ghost_layers')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
{%- endif %}
void {{kernel['function_name']}}CellInterval({{- ["IBlock * block", "const CellInterval & ci", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
{{kernel['kernel']|generate_swaps|indent(6)}}
}
void {{kernel['function_name']}}Inner({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements(advance=False)|indent(6)}}
CellInterval inner = {{kernel['field']}}->xyzSize();
inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2]));
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='inner')}});
}
void {{kernel['function_name']}}Outer({{- ["IBlock * block", ["gpuStream_t gpuStream = nullptr"] if target == 'gpu' else []] | type_identifier_list -}})
{
{{kernel['kernel']|generate_block_data_to_field_extraction|indent(6)}}
{{kernel['kernel']|generate_refs_for_kernel_parameters(prefix='this->', ignore_fields=True, parameter_registration=parameter_scaling)|indent(6)}}
{{kernel['kernel']|generate_timestep_advancements|indent(6)}}
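// Lazily build the six outer slabs once: T and B keep the full x-y extent, N and S
// are shrunk in z, and E and W are shrunk in both y and z, so the slabs tile the
// outer shell without overlap.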
if( layers_.empty() )
{
CellInterval ci;
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false);
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false);
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false);
ci.expand(Cell(0, 0, -outerWidth_[2]));
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false);
ci.expand(Cell(0, 0, -outerWidth_[2]));
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false);
ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
layers_.push_back(ci);
{{kernel['field']}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false);
ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
layers_.push_back(ci);
}
{%if target is equalto 'gpu'%}
{
auto parallelSection_ = parallelStreams_.parallelSection( gpuStream );
for( auto & ci: layers_ )
{
parallelSection_.run([&]( auto s ) {
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
});
}
}
{% else %}
for( auto & ci: layers_ )
{
{{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
}
{% endif %}
{{kernel['kernel']|generate_swaps|indent(9)}}
}
{% endfor %}
{%if target is equalto 'gpu'%}
void setOuterPriority(int priority)
{
parallelStreams_.setStreamPriority(priority);
}
{%endif%}
private:
shared_ptr< StructuredBlockForest > blocks_;
{{kernel_list|generate_members(parameter_registration=parameter_scaling)|indent(4)}}
Cell outerWidth_;
std::vector<CellInterval> layers_;
bool validInnerOuterSplit_;
{%if target is equalto 'gpu' -%}
gpu::ParallelStreams parallelStreams_;
{%- endif %}
{%if block_stream_collide -%}
std::vector<int64_t> size_0;
int64_t size_1;
int64_t size_2;
int64_t size_3;
{%- for field in block_stream_collide['all_fields'] %}
int64_t stride_{{field.name}}_0;
int64_t stride_{{field.name}}_1;
int64_t stride_{{field.name}}_2;
int64_t stride_{{field.name}}_3;
{% endfor %}
{%- for field in block_stream_collide['all_fields'] %}
std::vector<std::vector<{{field.dtype.c_name}} *>> {{field.name}}Pointers;
{% endfor -%}
std::vector<std::map<sizeTuple, std::vector<double *>>> glPropagationPDFs;
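// glPropagationPDFs maps, per refinement level, the size tuple of a ghost-layer
// interval to the PDF pointers of all regions sharing that extent, so equally sized
// regions can be fused into a single propagation kernel.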
{%if target is equalto 'gpu' -%}
{%- for field in block_stream_collide['all_fields'] %}
std::vector<{{field.dtype.c_name}} **> {{field.name}}PointersGPU;
{% endfor -%}
std::vector<dim3> block_;
std::vector<dim3> grid_;
std::vector<std::map<sizeTuple, double **>> glPropagationPDFsGPU;
std::vector<std::map<sizeTuple, dim3>> glPropagationBlock_;
std::vector<std::map<sizeTuple, dim3>> glPropagationGrid_;
{%- endif %}
{%- endif %}
};
} // namespace {{namespace}}
} // namespace walberla
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic pop
#endif
@@ -19,12 +19,45 @@
#include "{{class_name}}.h"
{% if target is equalto 'cpu' -%}
#define FUNC_PREFIX
{%- elif target is equalto 'gpu' -%}
#define FUNC_PREFIX __global__
#include "gpu/GPUWrapper.h"
#include "gpu/GPUField.h"
{%- endif %}
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
# pragma GCC diagnostic ignored "-Wshadow"
# pragma GCC diagnostic ignored "-Wconversion"
# pragma GCC diagnostic ignored "-Wunused-variable"
# pragma GCC diagnostic ignored "-Wignored-qualifiers"
#endif
#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
#pragma warning push
#pragma warning( disable : 1599 )
#endif
#ifdef __CUDACC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 191
#else
#pragma diag_suppress 191
#endif
#endif
namespace walberla {
@@ -42,6 +75,8 @@ namespace {{namespace}} {
{{ kernels['localCopyDirection'] | generate_definitions }}
{% if nonuniform -%}
{{ kernels['localCopyRedistribute'] | generate_definitions }}
{{ kernels['localPartialCoalescence'] | generate_definitions }}
{{ kernels['unpackRedistribute'] | generate_definitions }}
{{ kernels['packPartialCoalescence'] | generate_definitions }}
{{ kernels['zeroCoalescenceRegion'] | generate_definitions }}
@@ -89,7 +124,7 @@ namespace {{namespace}} {
WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
{{kernels['localCopyAll']
| generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
| generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
| indent(6) }}
}
@@ -115,6 +150,56 @@ namespace {{namespace}} {
{{kernels['unpackDirection'] | generate_call(cell_interval='ci', stream='stream') | indent(6) }}
}
void {{class_name}}::PackKernels::localCopyDirection(
{{- [src_field.dtype.c_name + "** _data_" + src_field.name + "_dp", dst_field.dtype.c_name + "** _data_" + dst_field.name + "_dp",
kernels['localCopyDirection'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else [], "std::array<int64_t, 4>& _sizes", "std::array<int64_t, 4>& _strides"]
| type_identifier_list -}}
) const {
{% if block_wise -%}
{% if target is equalto 'gpu' -%}
const int64_t indexingX = _sizes[0] * _sizes[1];
const int64_t indexingY = _sizes[2];
const int64_t indexingZ = _sizes[3];
const int64_t cudaBlockSize0 = 128;
const int64_t cudaBlockSize1 = 1;
const int64_t cudaBlockSize2 = 1;
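// Launch configuration (block-size defaults as generated): the x dimension folds the
// number of fused communication intervals (_sizes[0]) into the cell count, so one
// launch covers every block pair communicating in this direction.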
const int64_t _size_0 = _sizes[0];
const int64_t _size_{{src_field.name}}_0 = _sizes[1];
const int64_t _size_{{src_field.name}}_1 = _sizes[2];
const int64_t _size_{{src_field.name}}_2 = _sizes[3];
const int64_t _size_{{dst_field.name}}_0 = _sizes[1];
const int64_t _size_{{dst_field.name}}_1 = _sizes[2];
const int64_t _size_{{dst_field.name}}_2 = _sizes[3];
const int64_t _stride_{{src_field.name}}_0 = _strides[0];
const int64_t _stride_{{src_field.name}}_1 = _strides[1];
const int64_t _stride_{{src_field.name}}_2 = _strides[2];
const int64_t _stride_{{src_field.name}}_3 = _strides[3];
const int64_t _stride_{{dst_field.name}}_0 = _strides[0];
const int64_t _stride_{{dst_field.name}}_1 = _strides[1];
const int64_t _stride_{{dst_field.name}}_2 = _strides[2];
const int64_t _stride_{{dst_field.name}}_3 = _strides[3];
const int64_t maxBlockX = (indexingX < cudaBlockSize0) ? indexingX : cudaBlockSize0;
const int64_t maxBlockY = (indexingY < cudaBlockSize1 * (cudaBlockSize0 / maxBlockX)) ? indexingY : cudaBlockSize1 * (cudaBlockSize0 / maxBlockX);
const int64_t maxBlockZ = (indexingZ < cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY))) ? indexingZ : cudaBlockSize2 * ((cudaBlockSize0 * cudaBlockSize1) / (maxBlockX * maxBlockY));
const int64_t blockX = (maxBlockX < 1024) ? maxBlockX : 1024;
const int64_t blockY = (maxBlockY < 1024) ? maxBlockY : 1024;
const int64_t blockZ = (maxBlockZ < 64) ? maxBlockZ : 64;
const dim3 _block = dim3((unsigned int)blockX, (unsigned int)blockY, (unsigned int)blockZ);
const dim3 _grid = dim3((unsigned int)(indexingX % blockX == 0 ? indexingX / blockX : indexingX / blockX + 1),
(unsigned int)(indexingY % blockY == 0 ? indexingY / blockY : indexingY / blockY + 1),
(unsigned int)(indexingZ % blockZ == 0 ? indexingZ / blockZ : indexingZ / blockZ + 1));
{%- endif %}
{{kernels['localCopyDirection']
| generate_call(plain_kernel_call=True, stream='stream')
| indent(6) }}
{%else%}
WALBERLA_ABORT("Block-wise local communication is not implemented")
{%- endif %}
}
void {{class_name}}::PackKernels::localCopyDirection(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval",
@@ -123,16 +208,42 @@ namespace {{namespace}} {
| type_identifier_list -}}
) const
{
{% if not block_wise -%}
WALBERLA_ASSERT_EQUAL(srcInterval.xSize(), dstInterval.xSize())
WALBERLA_ASSERT_EQUAL(srcInterval.ySize(), dstInterval.ySize())
WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
{{kernels['localCopyDirection']
| generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
| generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
| indent(6) }}
{%else%}
WALBERLA_ABORT("Local communication is only implemented block-wise")
{%- endif %}
}
{% if nonuniform -%}
void {{class_name}}::PackKernels::localCopyRedistribute(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else []]
| type_identifier_list -}}
) const
{
{{kernels['localCopyRedistribute'] | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
}
void {{class_name}}::PackKernels::localPartialCoalescence(
{{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else []]
| type_identifier_list -}}
) const
{
{{kernels['localPartialCoalescence'] | generate_call(cell_interval={src_field.name : 'srcInterval', mask_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
}
void {{class_name}}::PackKernels::unpackRedistribute(
{{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci",
"unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters,
......
@@ -30,21 +30,13 @@
#include "stencil/{{stencil_name}}.h"
#include "stencil/Directions.h"
{% if target is equalto 'cpu' -%}
#define FUNC_PREFIX
{%- elif target is equalto 'gpu' -%}
#define FUNC_PREFIX __global__
{% if target is equalto 'gpu' -%}
#include "gpu/GPUWrapper.h"
#include "gpu/GPUField.h"
{%- endif %}
#ifdef __GNUC__
#define RESTRICT __restrict__
#elif _MSC_VER
#define RESTRICT __restrict
#else
#define RESTRICT
#endif
#include <array>
#if defined WALBERLA_CXX_COMPILER_IS_GNU || defined WALBERLA_CXX_COMPILER_IS_CLANG
#pragma GCC diagnostic push
@@ -178,6 +170,13 @@ class {{class_name}}
/** Copies data between two blocks on the same process.
* PDFs whose streaming direction is aligned with dir are copied from the sending interval onto the receiving interval.
* */
void localCopyDirection(
{{- [src_field.dtype.c_name + "** _data_" + src_field.name + "_dp", dst_field.dtype.c_name + "** _data_" + dst_field.name + "_dp",
kernels['localCopyDirection'].kernel_selection_parameters,
["gpuStream_t stream"] if is_gpu else [], "std::array<int64_t, 4>& _sizes", "std::array<int64_t, 4>& _strides"]
| type_identifier_list -}}
) const;
void localCopyDirection(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval",
@@ -186,6 +185,7 @@ class {{class_name}}
| type_identifier_list -}}
) const;
/**
* Returns the number of bytes that will be packed from / unpacked to the cell interval
* when using packDirection / unpackDirection
@@ -209,6 +209,26 @@ class {{class_name}}
{% if nonuniform -%}
/**
* Local variant of unpackRedistribute: copies populations from the coarse block and uniformly redistributes them onto the fine grid.
* */
void localCopyRedistribute(
{{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
["gpuStream_t stream = nullptr"] if is_gpu else []]
| type_identifier_list -}}
) const;
/**
* Local variant of packPartialCoalescence: coalesces masked fine-grid populations onto the coarse block.
* */
void localPartialCoalescence(
{{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
"PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
["gpuStream_t stream = nullptr"] if is_gpu else []]
| type_identifier_list -}}
) const;
/**
* Unpacks and uniformly redistributes populations coming from a coarse block onto the fine grid.
* */
......
@@ -3,7 +3,6 @@ from typing import Callable, List, Dict
from pystencils import Target, Field
from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation
from lbmpy.relaxationrates import get_shear_relaxation_rate
from pystencils_walberla.cmake_integration import CodeGenerationContext
@@ -20,7 +19,7 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
target: Target = Target.CPU,
data_type=None, pdfs_data_type=None,
cpu_openmp=None, cpu_vectorize_info=None,
max_threads=None,
max_threads=None, set_pre_collision_pdfs=True,
**kernel_parameters):
if macroscopic_fields is None:
@@ -35,7 +34,7 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
cpu_openmp=cpu_openmp)
if nonuniform:
omega = get_shear_relaxation_rate(method)
omega = lbm_config.relaxation_rate
refinement_scaling = RefinementScaling()
refinement_scaling.add_standard_relaxation_rate_scaling(omega)
else:
@@ -47,14 +46,15 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
macroscopic_fields=macroscopic_fields,
target=target, data_type=data_type,
cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info,
max_threads=max_threads,
max_threads=max_threads, set_pre_collision_pdfs=set_pre_collision_pdfs,
**kernel_parameters)
spatial_shape = None
if lbm_optimisation.symbolic_field and lbm_optimisation.symbolic_field.has_fixed_shape:
spatial_shape = lbm_optimisation.symbolic_field.spatial_shape + (lbm_config.stencil.Q, )
generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries,
lb_method=method, field_name='pdfs', spatial_shape=spatial_shape,
streaming_pattern=lbm_config.streaming_pattern,
target=target, layout=lbm_optimisation.field_layout)
if boundaries is not None:
generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries,
lb_method=method, field_name='pdfs', spatial_shape=spatial_shape,
streaming_pattern=lbm_config.streaming_pattern,
target=target, layout=lbm_optimisation.field_layout)
from .boundary import generate_staggered_boundary, generate_staggered_flux_boundary
from .boundary import generate_boundary, generate_staggered_boundary, generate_staggered_flux_boundary
from .cmake_integration import CodeGeneration, ManualCodeGenerationContext
from .function_generator import function_generator
@@ -8,7 +8,7 @@ from .pack_info import (generate_pack_info, generate_pack_info_for_field,
generate_pack_info_from_kernel, generate_mpidtype_info_from_kernel)
from .utility import generate_info_header, get_vectorize_instruction_set, config_from_context
__all__ = ['generate_staggered_boundary', 'generate_staggered_flux_boundary',
__all__ = ['generate_boundary', 'generate_staggered_boundary', 'generate_staggered_flux_boundary',
'CodeGeneration', 'ManualCodeGenerationContext',
'function_generator',
'generate_sweep', 'generate_selective_sweep', 'generate_sweep_collection',
......
@@ -67,8 +67,15 @@ def generate_boundary(generation_context,
if not kernel_creation_function:
kernel_creation_function = create_boundary_kernel
kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object,
target=target, **create_kernel_params)
if boundary_object.calculate_force_on_boundary:
force_vector_type = np.dtype([(f"F_{i}", np.float64) for i in range(dim)], align=True)
force_vector = Field('forceVector', FieldType.INDEXED, force_vector_type, layout=[0],
shape=(TypedSymbol("forceVectorSize", create_type("int32")), 1), strides=(1, 1))
kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object,
target=target, force_vector=force_vector, **create_kernel_params)
else:
kernel = kernel_creation_function(field, index_field, neighbor_stencil, boundary_object,
target=target, **create_kernel_params)
if isinstance(kernel, KernelFunction):
kernel.function_name = f"boundary_{boundary_object.name}"
@@ -103,7 +110,8 @@
'additional_data_handler': additional_data_handler,
'dtype': "double" if is_float else "float",
'layout': layout,
'index_shape': index_shape
'index_shape': index_shape,
'calculate_force': boundary_object.calculate_force_on_boundary
}
env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
......
@@ -9,7 +9,7 @@ import sympy as sp
from pystencils import Target, Backend
from pystencils.backends.cbackend import generate_c
from pystencils.typing import TypedSymbol, get_base_type
from pystencils.typing import TypedSymbol, get_base_type, PointerType
from pystencils.field import FieldType
from pystencils.sympyextensions import prod
@@ -226,17 +226,23 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno
def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore=(), ignore_fields=False,
parameter_registration=None):
parameter_registration=None, level_known=False):
pointer_symbols = {p.symbol.name for p in kernel_info.parameters
if not p.is_field_parameter and isinstance(p.symbol.dtype, PointerType)}
symbols = {p.field_name for p in kernel_info.parameters if p.is_field_pointer and not ignore_fields}
symbols.update(p.symbol.name for p in kernel_info.parameters if not p.is_field_parameter)
symbols.difference_update(parameters_to_ignore)
if ignore_fields:
symbols.difference_update(pointer_symbols)
type_information = {p.symbol.name: p.symbol.dtype for p in kernel_info.parameters if not p.is_field_parameter}
result = []
registered_parameters = [] if not parameter_registration else parameter_registration.scaling_info
for s in symbols:
if s in registered_parameters:
dtype = type_information[s].c_name
result.append("const uint_t level = block->getBlockStorage().getLevel(*block);")
if not level_known:
result.append("const uint_t level = block->getBlockStorage().getLevel(*block);")
result.append(f"{dtype} & {s} = {s}Vector[level];")
else:
result.append(f"auto & {s} = {prefix}{s}_;")
@@ -245,7 +251,7 @@ def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignor
@jinja2_context_decorator
def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, stream='0',
spatial_shape_symbols=()):
spatial_shape_symbols=(), plain_kernel_call=False):
"""Generates the function call to a pystencils kernel
Args:
@@ -265,6 +271,20 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st
parameters - however in special cases like boundary conditions a manual specification
may be necessary.
"""
ast_params = kernel.parameters
if len(spatial_shape_symbols) == 0:
for param in ast_params:
if param.is_field_parameter and FieldType.is_indexed(param.fields[0]):
continue
if param.is_field_pointer:
field = param.fields[0]
if field.has_fixed_shape:
spatial_shape_symbols = field.spatial_shape
if plain_kernel_call:
return kernel.generate_kernel_invocation_code(plain_kernel_call=True, stream=stream,
spatial_shape_symbols=spatial_shape_symbols)
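# plain_kernel_call bypasses the cell-interval and ghost-layer handling entirely; the
# caller is then responsible for having all kernel arguments (and, for GPU targets,
# _grid and _block) defined in the surrounding scope.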
assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0
ast_params = kernel.parameters
vec_info = ctx.get('cpu_vectorize_info', None)
@@ -296,7 +316,7 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st
if isinstance(cell_interval, str):
return cell_interval
elif isinstance(cell_interval, dict):
return cell_interval[field_object]
return cell_interval[field_object.name]
else:
return None
@@ -591,7 +611,8 @@ def generate_members(ctx, kernel_infos, parameters_to_ignore=None, only_fields=F
original_field_name = field_name[:-len('_tmp')]
f_size = get_field_fsize(f)
field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu)
result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name))
result.append(temporary_fieldMemberTemplate.format(type=field_type,
original_field_name=original_field_name))
for kernel_info in kernel_infos:
if hasattr(kernel_info, 'varying_parameters'):
@@ -734,6 +755,16 @@ def generate_constructor(ctx, kernel_infos, parameter_registration):
return "\n".join(result)
@jinja2_context_decorator
def generate_field_strides(ctx, kernel_info):
result = []
for param in kernel_info.parameters:
if param.is_field_stride:
type_str = param.symbol.dtype.c_name
result.append(f"const {type_str} {param.symbol.name} = {param.symbol.name[1:]};")
return "\n".join(result)
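# Sketch of the emitted line (field name hypothetical): a stride parameter named
# `_stride_pdfs_0` becomes
#     const int64_t _stride_pdfs_0 = stride_pdfs_0;
# (dtype taken from the kernel parameter) — the leading underscore is stripped so the
# value is read from the corresponding member variable.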
def generate_list_of_expressions(expressions, prepend=''):
if len(expressions) == 0:
return ''
@@ -806,3 +837,4 @@ def add_pystencils_filters_to_jinja_env(jinja_env):
jinja_env.filters['identifier_list'] = identifier_list
jinja_env.filters['list_of_expressions'] = generate_list_of_expressions
jinja_env.filters['field_type'] = field_type
jinja_env.filters['generate_field_strides'] = generate_field_strides
@@ -32,13 +32,20 @@ class KernelInfo:
all_headers = [list(get_headers(self.ast))]
return reduce(merge_sorted_lists, all_headers)
def generate_kernel_invocation_code(self, **kwargs):
def generate_kernel_invocation_code(self, plain_kernel_call=False, **kwargs):
ast = self.ast
ast_params = self.parameters
fnc_name = ast.function_name
is_cpu = self.ast.target == Target.CPU
call_parameters = ", ".join([p.symbol.name for p in ast_params])
if plain_kernel_call:
if is_cpu:
return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
else:
stream = kwargs.get('stream', '0')
return f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
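# For GPU targets the plain call is emitted verbatim, e.g. (names illustrative):
#     internal_kernel::kernel<<<_grid, _block, 0, stream>>>(...);
# which relies on _grid and _block being declared by the calling code.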
if not is_cpu:
stream = kwargs.get('stream', '0')
spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ())
......
@@ -157,6 +157,21 @@ class SwitchNode(AbstractKernelSelectionNode):
return switch_code
class AbortNode(AbstractKernelSelectionNode):
def __init__(self, message):
self.message = message
@property
def selection_parameters(self):
return set()
def collect_kernel_calls(self):
return set()
def get_code(self, **kwargs):
return f'WALBERLA_ABORT("{self.message}")'
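# AbortNode lets a kernel selection tree terminate a branch with a runtime error
# instead of a kernel call, e.g. as the fallback case of a SwitchNode.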
class KernelCallNode(AbstractKernelSelectionNode):
def __init__(self, ast):
self.ast = ast
@@ -169,13 +184,20 @@ class KernelCallNode(AbstractKernelSelectionNode):
def collect_kernel_calls(self):
return {self}
def get_code(self, **kwargs):
def get_code(self, plain_kernel_call=False, **kwargs):
ast = self.ast
ast_params = self.parameters
fnc_name = ast.function_name
is_cpu = self.ast.target == Target.CPU
call_parameters = ", ".join([p.symbol.name for p in ast_params])
if plain_kernel_call:
if is_cpu:
return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
else:
stream = kwargs.get('stream', '0')
return f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
if not is_cpu:
stream = kwargs.get('stream', '0')
spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ())
......
@@ -82,10 +82,23 @@ void {{class_name}}::run_impl(
uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
{{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}}
{% if calculate_force -%}
auto * forceVector = block->getData<ForceVector>(forceVectorID);
WALBERLA_ASSERT_EQUAL(indexVectorSize, int32_c( forceVector->forceVector().size() ))
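// When force computation is enabled, the kernel receives the per-link force buffer in
// addition to the boundary index vector; the two must have the same length, one entry
// per boundary link.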
{% if target == 'gpu' -%}
auto forcePointer = forceVector->pointerGpu();
int32_t forceVectorSize = int32_c( forceVector->forceVector().size() );
{% else %}
auto forcePointer = forceVector->pointerCpu();
{% endif %}
uint8_t * _data_forceVector = reinterpret_cast<uint8_t*>(forcePointer);
{%- endif %}
{{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize', 'forceVector', 'forceVectorSize'])|indent(4)}}
{{kernel|generate_timestep_advancements|indent(4)}}
{{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize'], ignore_fields=True)|indent(4) }}
{{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}}
{{kernel|generate_refs_for_kernel_parameters(prefix='', parameters_to_ignore=['indexVectorSize', 'forceVectorSize'], ignore_fields=True)|indent(4) }}
{{kernel|generate_call(spatial_shape_symbols=['indexVectorSize', 'forceVectorSize'], stream='stream')|indent(4)}}
}
void {{class_name}}::run(
......