Commit a385588d authored by Helen Schottenhamml

Merge branch 'UniformGridCPU' into 'master'

UniformGridCPU

See merge request walberla/walberla!516
parents ca42e06d a5a51ac0
Showing with 933 additions and 658 deletions
......@@ -22,7 +22,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
if ( WALBERLA_BUILD_WITH_CODEGEN )
add_subdirectory( FlowAroundSphereCodeGen )
add_subdirectory( UniformGridGenerated )
add_subdirectory( UniformGridCPU )
add_subdirectory( PhaseFieldAllenCahn )
endif()
......
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
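# For illustration: stencil d3q19, streaming pattern pull and collision setup srt give
# config "d3q19_pull_srt" and thus the targets UniformGridCPUGenerated_d3q19_pull_srt
# and UniformGridCPU_d3q19_pull_srt defined below.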
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridCPU_LbKernel.cpp UniformGridCPU_LbKernel.h
UniformGridCPU_PackInfoEven.cpp UniformGridCPU_PackInfoEven.h
UniformGridCPU_PackInfoOdd.cpp UniformGridCPU_PackInfoOdd.h
UniformGridCPU_NoSlip.cpp UniformGridCPU_NoSlip.h
UniformGridCPU_UBB.cpp UniformGridCPU_UBB.h
UniformGridCPU_MacroSetter.cpp UniformGridCPU_MacroSetter.h
UniformGridCPU_MacroGetter.cpp UniformGridCPU_MacroGetter.h
UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h
UniformGridCPU_InfoHeader.h
)
waLBerla_add_executable(NAME UniformGridCPU_${config}
FILES UniformGridCPU.cpp
DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk UniformGridCPUGenerated_${config})
# All configurations except pull d3q27 are excluded from the 'all' target.
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach ()
endforeach()
endforeach()
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridCPU.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "core/Environment.h"
#include "core/OpenMP.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "domain_decomposition/SharedSweep.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm/communication/CombinedInPlaceCpuPackInfo.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include "timeloop/all.h"
#include <iomanip>
#include "InitShearVelocity.h"
#include "ManualKernels.h"
#include "UniformGridCPU_InfoHeader.h"
using namespace walberla;
using PackInfoEven_T = lbm::UniformGridCPU_PackInfoEven;
using PackInfoOdd_T = lbm::UniformGridCPU_PackInfoOdd;
using LbSweep = lbm::UniformGridCPU_LbKernel;
using FlagField_T = FlagField< uint8_t >;
auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) {
return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block),
storage->getNumberOfZCells(*block), uint_t(1), field::fzyx,
make_shared< field::AllocateAligned< real_t, 64 > >());
};
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
auto config = *cfg;
logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs");
BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_t(1.0), field::fzyx);
// Initialize velocity on cpu
if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldId);
}
pystencils::UniformGridCPU_MacroSetter setterSweep(densityFieldId, pdfFieldId, velFieldId);
pystencils::UniformGridCPU_MacroGetter getterSweep(densityFieldId, pdfFieldId, velFieldId);
// Set up initial PDF values
for (auto& block : *blocks)
setterSweep(&block);
Vector3< int > innerOuterSplit =
parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
for (uint_t i = 0; i < 3; ++i)
{
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
}
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell);
pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId);
// Boundaries
const FlagUID fluidFlagUID("Fluid");
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock("Boundaries");
bool boundaries = false;
if (boundariesConfig)
{
WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
boundaries = true;
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
}
lbm::UniformGridCPU_NoSlip noSlip(blocks, pdfFieldId);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
lbm::UniformGridCPU_UBB ubb(blocks, pdfFieldId);
ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0);
auto packInfo =
make_shared< lbm::CombinedInPlaceCpuPackInfo< PackInfoEven_T , PackInfoOdd_T > >(tracker, pdfFieldId);
blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto boundarySweep = [&](IBlock* block, uint8_t t) {
noSlip.run(block, t);
ubb.run(block, t);
};
auto boundaryInner = [&](IBlock* block, uint8_t t) {
noSlip.inner(block, t);
ubb.inner(block, t);
};
auto boundaryOuter = [&](IBlock* block, uint8_t t) {
noSlip.outer(block, t);
ubb.outer(block, t);
};
auto simpleOverlapTimeStep = [&]() {
// Communicate post-collision values of previous timestep...
communication.startCommunication();
for (auto& block : *blocks)
{
if (boundaries) boundaryInner(&block, tracker->getCounter());
lbSweep.inner(&block, tracker->getCounterPlusOne());
}
communication.wait();
for (auto& block : *blocks)
{
if (boundaries) boundaryOuter(&block, tracker->getCounter());
lbSweep.outer(&block, tracker->getCounterPlusOne());
}
tracker->advance();
};
auto normalTimeStep = [&]() {
communication.communicate();
for (auto& block : *blocks)
{
if (boundaries) boundarySweep(&block, tracker->getCounter());
lbSweep(&block, tracker->getCounterPlusOne());
}
tracker->advance();
};
// With two-field patterns, ghost layer cells act as constant stream-in boundaries;
// with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
auto kernelOnlyFunc = [&]() {
tracker->advance();
for (auto& block : *blocks)
lbSweep(&block, tracker->getCounter());
};
// Stream-only function to test a streaming pattern without executing LBM operations
auto StreamOnlyFunc = [&]() {
for (auto& block : *blocks)
StreamOnlyKernel(&block);
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
std::function< void() > timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function< void() >(normalTimeStep);
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only compute kernel without boundary - this makes only sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
communication.communicate();
timeStep = kernelOnlyFunc;
}
else if (timeStepStrategy == "StreamOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only streaming kernel without LBM - this makes only sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
timeStep = StreamOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
"'simpleOverlap', 'kernelOnly'")
}
timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldId, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks){
getterSweep(&block);}
});
timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int warmupSteps = parameters.getParameter< int >("warmupSteps", 2);
int outerIterations = parameters.getParameter< int >("outerIterations", 1);
for (int i = 0; i < warmupSteps; ++i)
timeLoop.singleStep();
real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", -1.0); // in seconds
if (remainingTimeLoggerFrequency > 0)
{
auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
remainingTimeLoggerFrequency);
timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
}
for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
simTimer.start();
timeLoop.run();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
real_t time = simTimer.last();
WALBERLA_MPI_SECTION()
{
walberla::mpi::reduceInplace(time, walberla::mpi::MAX);
}
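// MLUPS = million lattice (cell) updates per second, computed per process from the cells
// of one block (one block per process is assumed here) and the maximum runtime over all ranks.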
auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return EXIT_SUCCESS;
}
from dataclasses import replace
import sympy as sp
import pystencils as ps
from pystencils.fast_approximation import insert_fast_divisions, insert_fast_sqrts
from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\
insert_symbol_times_minus_one
from lbmpy.advanced_streaming import Timestep, is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
from lbmpy.enums import Method, Stencil
from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
from lbmpy.updatekernels import create_stream_only_kernel
from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\
generate_mpidtype_info_from_kernel, generate_info_header
from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary, generate_lb_pack_info
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
# best configs in terms of FLOPS
options_dict = {
'srt': {
'method': Method.SRT,
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': Method.TRT,
'relaxation_rate': omega,
'compressible': False,
},
'mrt': {
'method': Method.MRT,
'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'compressible': False,
},
'mrt-overrelax': {
'method': Method.MRT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': False,
},
'central': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rate': omega,
'compressible': True,
},
'central-overrelax': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
},
'cumulant': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rate': omega,
'compressible': True,
},
'cumulant-overrelax': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'entropic': {
'method': Method.TRT_KBC_N4,
'compressible': True,
'relaxation_rates': [omega, omega_free],
'entropic': True,
'entropic_newton_iterations': False
},
'smagorinsky': {
'method': Method.SRT,
'smagorinsky': True,
'relaxation_rate': omega,
}
}
info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
# DEFAULTS
optimize = True
with CodeGeneration() as ctx:
openmp = bool(ctx.openmp)
field_type = "float64" if ctx.double_accuracy else "float32"
if ctx.optimize_for_localhost:
cpu_vec = {"nontemporal": True, "assume_aligned": True}
else:
cpu_vec = None
config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3
stencil_str = config_tokens[0]
streaming_pattern = config_tokens[1]
collision_setup = config_tokens[2]
if len(config_tokens) >= 4:
optimize = (config_tokens[3] != 'noopt')
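# For illustration: a CODEGEN_CFG string such as "d3q19_aa_smagorinsky" is split into
# stencil "d3q19", streaming pattern "aa" and collision setup "smagorinsky"; an optional
# fourth token equal to "noopt" would disable the optimisation passes applied below.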
if stencil_str == "d3q27":
stencil = LBStencil(Stencil.D3Q27)
elif stencil_str == "d3q19":
stencil = LBStencil(Stencil.D3Q19)
else:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup]
q = stencil.Q
dim = stencil.D
assert dim == 3, "This app supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({q}), pdfs_tmp({q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
field_swaps = [(pdfs, pdfs_tmp)]
else:
field_swaps = []
# Stream-only sweep for benchmarking the bare streaming pattern without LBM collision.
# is_inplace is set to False to ensure that streaming is done with separate src and dst fields.
# Otherwise the compiler might simplify the streaming in a way that makes the benchmark meaningless.
accessor = CollideOnlyInplaceAccessor()
accessor.is_inplace = False
field_swaps_stream_only = [(pdfs, pdfs_tmp)]
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
# LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
if optimize:
collision_rule = insert_fast_divisions(collision_rule)
collision_rule = insert_fast_sqrts(collision_rule)
collision_rule = insert_constants(collision_rule)
collision_rule = insert_zeros(collision_rule)
collision_rule = insert_aliases(collision_rule)
collision_rule = insert_symbol_times_minus_one(collision_rule)
lb_method = collision_rule.method
generate_alternating_lbm_sweep(ctx, 'UniformGridCPU_LbKernel', collision_rule, lbm_config=lbm_config,
lbm_optimisation=lbm_opt, target=ps.Target.CPU,
inner_outer_split=True, field_swaps=field_swaps,
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec)
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method,
density=density_field.center, velocity=velocity_field.center_vector,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
getter_assignments = macroscopic_values_getter(lb_method,
density=density_field, velocity=velocity_field,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridCPU_MacroSetter', setter_assignments, target=ps.Target.CPU, cpu_openmp=openmp)
generate_sweep(ctx, 'UniformGridCPU_MacroGetter', getter_assignments, target=ps.Target.CPU, cpu_openmp=openmp)
# Stream only kernel
generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
target=ps.Target.CPU, cpu_openmp=openmp)
# Boundaries
noslip = NoSlip()
ubb = UBB((0.05, 0, 0), data_type=field_type)
generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_NoSlip', noslip, lb_method, field_name=pdfs.name,
streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp)
generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_UBB', ubb, lb_method, field_name=pdfs.name,
streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp)
# communication
generate_lb_pack_info(ctx, 'UniformGridCPU_PackInfo', stencil, pdfs,
streaming_pattern=streaming_pattern, target=ps.Target.CPU,
always_generate_separate_classes=True)
infoHeaderParams = {
'stencil': stencil_str,
'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
stencil_typedefs = {'Stencil_T': stencil,
'CommunicationStencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'UniformGridCPU_InfoHeader',
stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
additional_code=info_header.format(**infoHeaderParams))
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
from math import prod
# Number of time steps run for a workload of 128^3 cells per process;
# if twice as many cells are on the process, half as many time steps are run, etc.
# Increase this to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = 5
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block
return int(time_steps)
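# Example (sketch): a 64^3 block holds 1/8 of the cells of a 128^3 block, so
# num_time_steps((64, 64, 64)) returns 8 * TIME_STEPS_FOR_128_BLOCK = 40.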
ldc_setup = {'Border': [
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1),
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
self.time_step_strategy = time_step_strategy
self.omega = omega
self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
self.inner_outer_split = inner_outer_split
self.init_shear_flow = init_shear_flow
self.boundary_setup = boundary_setup
self.warmup_steps = warmup_steps
self.outer_iterations = outer_iterations
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
from pprint import pformat
config_dict = {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'Parameters': {
'omega': self.omega,
'warmupSteps': self.warmup_steps,
'outerIterations': self.outer_iterations,
'timeStepStrategy': self.time_step_strategy,
'timesteps': self.timesteps,
'initShearFlow': self.init_shear_flow,
'innerOuterSplit': self.inner_outer_split,
'vtkWriteFrequency': self.vtk_write_frequency,
'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# retry a few times; writing may fail when multiple benchmark processes access the database concurrently
table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
storeSingle(result, table_name, DB_FILE)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
# -------------------------------------- Profiling -----------------------------------
def profiling():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running 2 timesteps for profiling")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells = (128, 128, 128)
scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
inner_outer_split=(1, 1, 1), timesteps=2,
outer_iterations=1, warmup_steps=0))
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def overlap_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running different communication overlap strategies")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
inner_outer_splits = [(1, 1, 1), (4, 1, 1)]
cells_per_block = [(i, i, i) for i in (16, 32, 64, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
for inner_outer_split in inner_outer_splits:
scenario = Scenario(time_step_strategy='simpleOverlap',
inner_outer_split=inner_outer_split,
cells_per_block=cell_per_block)
scenarios.add(scenario)
def scaling_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running scaling benchmark")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells_per_block = [(32, 32, 32), (128, 128, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
def single_node_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single Node benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
for block_size in block_sizes:
scenario = Scenario(cells_per_block=block_size,
time_step_strategy='kernelOnly',
timesteps=num_time_steps(block_size))
scenarios.add(scenario)
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
wlb.log_info_on_root("")
time_step_strategy = 'simpleOverlap' # 'noOverlap'
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(64, 64, 64),
time_step_strategy=time_step_strategy,
timesteps=101,
outer_iterations=1,
warmup_steps=0,
init_shear_flow=True,
boundary_setup=False,
vtk_write_frequency=100,
remaining_time_logger_frequency=10)
scenarios.add(scenario)
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run
single_node_benchmark() # benchmarks different domain sizes and measures single-process
# performance of the compute kernel (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
# validation_run()
# scaling_benchmark()
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
foreach(config srt trt smagorinsky mrt entropic_kbc_n4 cumulant )
waLBerla_generate_target_from_python(NAME UniformGridGenerated_${config}
CODEGEN_CFG ${config}
FILE UniformGridGenerated.py
OUT_FILES GenMacroGetter.cpp GenMacroGetter.h GenMacroSetter.cpp GenMacroSetter.h
GenPackInfo.cpp GenPackInfo.h GenPackInfoAAPush.cpp GenPackInfoAAPush.h GenPackInfoAAPull.cpp GenPackInfoAAPull.h
GenLbKernel.cpp GenLbKernel.h GenLbKernelAAEven.cpp GenLbKernelAAEven.h GenLbKernelAAOdd.cpp GenLbKernelAAOdd.h
GenMpiDtypeInfo.h GenMpiDtypeInfoAAPull.h GenMpiDtypeInfoAAPush.h
GenDefines.h)
waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config}
FILES UniformGridGenerated.cpp
DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui
UniformGridGenerated_${config} python_coupling)
endforeach()
DomainSetup
{
blocks < 1, 1, 1 >;
cellsPerBlock < 64, 64, 64 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 200; // time steps of one performance measurement
warmupSteps 1; // number of steps to run before measurement starts
outerIterations 4; // how many measurements to conduct
vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
timeStepMode twoField;
//twoFieldKernelType manualD3Q19;
remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time
directComm 0;
omega 1.8;
shearVelocityMagnitude 0.02;
useGui 0;
}
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/OpenMP.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/vtk/VTKWriter.h"
#include "field/AddToStorage.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "timeloop/all.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/waLBerlaBuildInfo.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "InitShearVelocity.h"
#include "ManualKernels.h"
#include "lbm/lattice_model/D3Q19.h"
#include "GenDefines.h"
#include <iomanip>
using namespace walberla;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER()
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", real_t(0.08));
const bool directComm = parameters.getParameter<bool>("directComm", false);
auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
return new PdfField_T(storage->getNumberOfXCells(*block),
storage->getNumberOfYCells(*block),
storage->getNumberOfZCells(*block),
uint_t(1),
field::fzyx,
make_shared<field::AllocateAligned<real_t, 64>>());
};
// Creating fields
BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
if( shearVelocityMagnitude > 0 )
initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
for( auto & b : *blocks)
setterKernel(&b);
// Buffered Comm
blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
// Direct Comm
blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated");
std::function<void(IBlock*)> twoFieldKernel;
if( twoFieldKernelType == "generated") {
twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega);
} else if (twoFieldKernelType == "manualGeneric") {
using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
} else if (twoFieldKernelType == "manualD3Q19") {
using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
} else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", "
"valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"")
}
using F = std::function<void()>;
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
if( timeStepMode == "twoField")
{
timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
<< Sweep( twoFieldKernel, "LB stream & collide1" );
timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
<< Sweep( twoFieldKernel, "LB stream & collide2" );
} else if ( timeStepMode == "twoFieldKernelOnly") {
timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
} else if ( timeStepMode == "aa") {
timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
<< Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
<< AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
} else if ( timeStepMode == "aaKernelOnly") {
timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
} else {
WALBERLA_ABORT("Invalid value for timeStepMode")
}
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
for(int i=0; i < warmupSteps; ++i )
timeLoop.singleStep();
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if (remainingTimeLoggerFrequency > 0) {
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
vtkOutput->addCellDataWriter( velWriter );
vtkOutput->addBeforeFunction( [&]()
{ for( auto & b : *blocks)
getterKernel(&b);
} );
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
GUI gui( timeLoop, blocks, argc, argv);
gui.run();
}
else
{
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
auto threads = omp_get_max_threads();
simTimer.start();
timeLoop.run();
simTimer.end();
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
using std::setw;
WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
" procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
" threads: " << threads <<
" direct_comm: " << directComm <<
" time steps: " << timesteps <<
setw(15) << " block size: " << cellsPerBlock <<
" mlups/core: " << int(mlupsPerProcess/ threads) <<
" mlups: " << int(mlupsPerProcess) * MPIManager::instance()->numProcesses())
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
}
return 0;
}
import sympy as sp
import pystencils as ps
from lbmpy.creationfunctions import create_lb_update_rule, create_lb_collision_rule
from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\
generate_mpidtype_info_from_kernel, generate_info_header
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
omega_fill = sp.symbols('omega_:10')
options_dict = {
'srt': {
'method': 'srt',
'stencil': 'D3Q19',
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': 'trt',
'stencil': 'D3Q19',
'compressible': False,
'relaxation_rate': omega,
},
'mrt': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [0.0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
},
'mrt_full': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
omega_fill[3], omega_fill[4], omega_fill[5]],
},
'entropic': {
'method': 'mrt',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
'entropic': True,
},
'entropic_kbc_n4': {
'method': 'trt_kbc_n4',
'stencil': 'D3Q27',
'compressible': True,
'relaxation_rates': [omega, omega_free],
'entropic': True,
},
'smagorinsky': {
'method': 'srt',
'stencil': 'D3Q19',
'smagorinsky': True,
'relaxation_rate': omega,
},
'cumulant': {
'method': 'cumulant',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rate': omega,
},
}
with CodeGeneration() as ctx:
common_options = {
'field_name': 'pdfs',
'temporary_field_name': 'pdfs_tmp',
}
opts = {
'two_field_cse_pdfs': False,
'two_field_cse_global': False,
'two_field_split': True,
'two_field_nt_stores': True,
'aa_even_cse_pdfs': False,
'aa_even_cse_global': False,
'aa_even_split': False,
'aa_even_nt_stores': False,
'aa_odd_cse_pdfs': False,
'aa_odd_cse_global': False,
'aa_odd_split': True,
'aa_odd_nt_stores': False,
'compiled_in_boundaries': False,
}
config_name = ctx.config
noopt = False
d3q27 = False
if config_name.endswith('_d3q27'):
d3q27 = True
config_name = config_name[:-len('_d3q27')]
if config_name == '':
config_name = 'trt'
options = options_dict[config_name].copy()
options.update(common_options)
if d3q27:
stencil = LBStencil(Stencil.D3Q27)
options['stencil'] = stencil
else:
stencil = LBStencil(options['stencil'])
dtype_string = 'float64' if ctx.double_accuracy else 'float32'
pdfs, velocity_field = ps.fields(f'pdfs({stencil.Q}), velocity(3) : {dtype_string}[3D]', layout='fzyx')
lbm_config = LBMConfig(**options)
lbm_optimisation = LBMOptimisation(symbolic_field=pdfs, split=opts['two_field_split'],
cse_global=opts['two_field_cse_global'], cse_pdfs=opts['two_field_cse_pdfs'])
update_rule_two_field = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_optimisation)
if opts['compiled_in_boundaries']:
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.boundaries.boundaries_in_kernel import update_rule_with_push_boundaries
from collections import OrderedDict
boundaries = OrderedDict((
((1, 0, 0), NoSlip()),
((-1, 0, 0), NoSlip()),
((0, 1, 0), NoSlip()),
((0, -1, 0), NoSlip()),
((0, 0, 1), UBB([0.05, 0, 0])),
((0, 0, -1), NoSlip()),
))
cr_even = create_lb_collision_rule(lbm_config=LBMConfig(stencil=LBStencil(Stencil.D3Q19), compressible=False),
lbm_optimisation=LBMOptimisation(cse_global=opts['aa_even_cse_global'],
cse_pdfs=opts['aa_even_cse_pdfs']))
cr_odd = create_lb_collision_rule(lbm_config=LBMConfig(stencil=LBStencil(Stencil.D3Q19), compressible=False),
lbm_optimisation=LBMOptimisation(cse_global=opts['aa_odd_cse_global'],
cse_pdfs=opts['aa_odd_cse_pdfs']))
update_rule_aa_even = update_rule_with_push_boundaries(cr_even, pdfs, boundaries,
AAEvenTimeStepAccessor, AAOddTimeStepAccessor.read)
update_rule_aa_odd = update_rule_with_push_boundaries(cr_odd, pdfs, boundaries,
AAOddTimeStepAccessor, AAEvenTimeStepAccessor.read)
else:
lbm_opt_even = LBMOptimisation(symbolic_field=pdfs, split=opts['aa_even_split'],
cse_global=opts['aa_even_cse_global'], cse_pdfs=opts['aa_even_cse_pdfs'])
update_rule_aa_even = create_lb_update_rule(lbm_config=LBMConfig(**options,
kernel_type=AAEvenTimeStepAccessor()),
lbm_optimisation=lbm_opt_even)
lbm_opt_odd = LBMOptimisation(symbolic_field=pdfs, split=opts['aa_odd_split'],
cse_global=opts['aa_odd_cse_global'], cse_pdfs=opts['aa_odd_cse_pdfs'])
update_rule_aa_odd = create_lb_update_rule(lbm_config=LBMConfig(**options,
kernel_type=AAOddTimeStepAccessor()),
lbm_optimisation=lbm_opt_odd)
vec = {'assume_aligned': True, 'assume_inner_stride_one': True}
# check if OpenMP is enabled in CMake
openmp_enabled = bool(ctx.openmp)
# Sweeps
vec['nontemporal'] = opts['two_field_nt_stores']
generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
cpu_vectorize_info=vec)
vec['nontemporal'] = opts['aa_even_nt_stores']
generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
cpu_openmp=openmp_enabled, ghost_layers=1)
vec['nontemporal'] = opts['aa_odd_nt_stores']
generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
cpu_openmp=openmp_enabled, ghost_layers=1)
setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1.0)
getter_assignments = macroscopic_values_getter(update_rule_two_field.method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=None)
generate_sweep(ctx, 'GenMacroSetter', setter_assignments, cpu_openmp=openmp_enabled)
generate_sweep(ctx, 'GenMacroGetter', getter_assignments, cpu_openmp=openmp_enabled)
# Communication
generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule_two_field,
cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPull', update_rule_aa_odd, kind='pull',
cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push',
cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfo', update_rule_two_field)
generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPull', update_rule_aa_odd, kind='pull')
generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPush', update_rule_aa_odd, kind='push')
additional_code = f"""
const char * infoStencil = "{stencil.name}";
const char * infoConfigName = "{ctx.config}";
const char * optimizationDict = "{str(opts)}";
"""
stencil_typedefs = {'Stencil_T': stencil,
'CommunicationStencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field}
generate_info_header(ctx, "GenDefines.h", stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
additional_code=additional_code)
import math
import os
import operator
import waLBerla as wlb
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
from waLBerla.tools.config import block_decomposition
from functools import reduce
import sqlite3
def prod(seq):
return reduce(operator.mul, seq, 1)
def get_block_decomposition(block_decomposition, num_processes):
bx = by = bz = 1
blocks_per_axis = int(math.log(num_processes, 2))
for i in range(blocks_per_axis):
decomposition_axis = block_decomposition[i % len(block_decomposition)]
if decomposition_axis == 'y':
by *= 2
elif decomposition_axis == 'z':
bz *= 2
elif decomposition_axis == 'x':
bx *= 2
assert (bx * by * bz) == num_processes
return bx, by, bz
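# Example (sketch): get_block_decomposition('xyz', 4) doubles along x and y and returns
# (2, 2, 1); get_block_decomposition('z', 4) doubles along z twice and returns (1, 1, 4).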
def calculate_time_steps(runtime, expected_mlups, domain_size):
cells = prod(domain_size)
time_steps_per_second = expected_mlups * 1e6 / cells
return int(time_steps_per_second * runtime)
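# Example (sketch): with an expected 100 MLUPS on a 128^3 domain,
# calculate_time_steps(10, 100, (128, 128, 128)) gives int(100e6 / 128**3 * 10) ≈ 476 steps.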
def domain_decomposition_func_z(processes, threads, block_size):
return {
'blocks': (1, 1, processes),
'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads)
}
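# Example (sketch): 4 processes with 8 OpenMP threads each and block_size (256, 128, 128)
# yield blocks (1, 1, 4) and cellsPerBlock (256, 128, 1024).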
def domain_decomposition_func_full(processes, threads, block_size):
return {
'blocks': block_decomposition(processes),
'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads)
}
class BenchmarkScenario:
def __init__(self, block_size=(256, 128, 128), direct_comm=True,
time_step_mode='aa', two_field_kernel_type='generated',
domain_decomposition_func=domain_decomposition_func_z,
db_file_name='uniform_grid_gen.sqlite'):
self.block_size = block_size
self.direct_comm = direct_comm
self.time_step_mode = time_step_mode
self.two_field_kernel_type = two_field_kernel_type
self.domain_decomposition_func = domain_decomposition_func
self.threads = int(os.environ['OMP_NUM_THREADS'])
self.processes = wlb.mpi.numProcesses()
self.db_file_name = db_file_name
@wlb.member_callback
def config(self, **kwargs):
time_steps_for_128_cubed = 10
time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed)
time_steps = max(10, time_steps)
cfg = {
'DomainSetup': {
'periodic': (1, 1, 1),
},
'Parameters': {
'timesteps': time_steps,
'warmupSteps': 2,
'outerIterations': 4,
'vtkWriteFrequency': 0,
'remainingTimeLoggerFrequency': 0,
'omega': 1.6,
'timeStepMode': self.time_step_mode,
'twoFieldKernelType': self.two_field_kernel_type,
'directComm': self.direct_comm,
}
}
cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size))
return cfg
@wlb.member_callback
def results_callback(self, mlupsPerProcess, optimizations, **kwargs):
cfg = self.config()
result = {
'block_size': self.block_size,
'mlups_per_core': mlupsPerProcess / self.threads,
'threads': self.threads,
'processes': self.processes,
'time_step_mode': self.time_step_mode,
'direct_comm': self.direct_comm,
'time_steps': cfg['Parameters']['timesteps'],
'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''),
'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_DOMAIN', ''),
}
optimizations = eval(optimizations)
result.update(optimizations)
result.update(kwargs)
sequenceValuesToScalars(result)
num_tries = 4
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, "runs", self.db_file_name)
storeSingle(result, "runs", self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/4 " + str(e))
def block_size_ok(sc):
block_size = sc.config()['DomainSetup']['cellsPerBlock']
return prod(block_size) * 19 < 2 ** 31
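# (Assumed rationale) 19 PDF values per cell must remain addressable with a signed 32-bit
# index, so the per-block cell count times 19 has to stay below 2^31.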
def single_node_benchmark():
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
(1024, 64, 32), (2048, 64, 16),
(64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]:
for direct_comm in (True, False):
for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
if time_step_mode == 'twoField':
for kt in ['generated', 'manualGeneric', 'manualD3Q19']:
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
time_step_mode=time_step_mode, two_field_kernel_type=kt,
domain_decomposition_func=domain_decomposition_func_z
)
if not block_size_ok(sc):
continue
scenarios.add(sc)
else:
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
domain_decomposition_func=domain_decomposition_func_z,
time_step_mode=time_step_mode)
if not block_size_ok(sc):
continue
# scenarios.add(sc)
def single_node_benchmark_small():
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
(1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16),
(256, 128, 64), (512, 128, 32)]:
for direct_comm in (True, False):
for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode)
if not block_size_ok(sc):
continue
scenarios.add(sc)
def weak_scaling():
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
(1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16),
(256, 128, 64), (512, 128, 32)]:
for direct_comm in (True, False):
sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
domain_decomposition_func=domain_decomposition_func_full)
if not block_size_ok(sc):
continue
scenarios.add(sc)
single_node_benchmark()
add_library(
lodepng
EXCLUDE_FROM_ALL
lodepng.cpp
lodepng.h
)
target_include_directories(lodepng PUBLIC SYSTEM "${CMAKE_CURRENT_SOURCE_DIR}")
......@@ -10,4 +10,4 @@ if (MSVC)
target_compile_options( lodepng PRIVATE /w )
else ()
target_compile_options( lodepng PRIVATE -w )
endif ()
\ No newline at end of file
endif ()
......@@ -17,7 +17,7 @@ def generate_lb_pack_info(generation_context,
namespace='lbm',
target=Target.CPU,
data_type=None,
cpu_openmp=None,
cpu_openmp=False,
**create_kernel_params):
"""Generates waLBerla MPI PackInfos for an LBM kernel, based on a given method
and streaming pattern. For in-place streaming patterns, two PackInfos are generated;
......
......@@ -160,7 +160,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
def generate_pack_info_for_field(generation_context, class_name: str, field: Field,
direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None,
operator=None, gl_to_inner=False,
target=Target.CPU, data_type=None, cpu_openmp=None,
target=Target.CPU, data_type=None, cpu_openmp=False,
**create_kernel_params):
"""Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements.
......@@ -188,7 +188,7 @@ def generate_pack_info_for_field(generation_context, class_name: str, field: Fie
def generate_pack_info_from_kernel(generation_context, class_name: str, assignments: Sequence[Assignment],
kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=None,
kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False,
**create_kernel_params):
"""Generates a waLBerla GPU PackInfo from a (pull) kernel.
......@@ -241,7 +241,7 @@ def generate_pack_info_from_kernel(generation_context, class_name: str, assignme
def generate_pack_info(generation_context, class_name: str,
directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]],
namespace='pystencils', operator=None, gl_to_inner=False,
target=Target.CPU, data_type=None, cpu_openmp=None,
target=Target.CPU, data_type=None, cpu_openmp=False,
**create_kernel_params):
"""Generates a waLBerla GPU PackInfo
......@@ -258,6 +258,9 @@ def generate_pack_info(generation_context, class_name: str,
cpu_openmp: whether the loops should use OpenMP.
**create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
"""
if cpu_openmp:
raise ValueError("The packing kernels are already called inside an OpenMP parallel region. Thus "
"additionally parallelising each kernel is not supported.")
items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()]
items = sorted(items, key=lambda e: e[0])
directions_to_pack_terms = OrderedDict(items)
......@@ -286,7 +289,7 @@ def generate_pack_info(generation_context, class_name: str,
if len(data_types) == 0:
raise ValueError("No fields to pack!")
if len(data_types) != 1:
err_detail = "\n".join(" - {} [{}]".format(f.name, f.dtype) for f in fields_accessed)
err_detail = "\n".join(f" - {f.name} [{f.dtype}]" for f in fields_accessed)
raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail)
dtype = data_types.pop()
......
......@@ -44,7 +44,6 @@ public:
unpackData( receiver, stencil::inverseDir[dir], rBuffer );
}
private:
void packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const {
const auto dataSize = size(dir, sender);
pack(dir, outBuffer.forward(dataSize), const_cast<IBlock*>(sender));
......@@ -54,6 +53,7 @@ private:
void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block) const;
uint_t size (stencil::Direction dir, const IBlock * block) const;
private:
{{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}}
};
......
......@@ -102,7 +102,7 @@ void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_param
{
{{kernel|generate_block_data_to_field_extraction|indent(4)}}
if( layers_.size() == 0 )
if( layers_.empty() )
{
CellInterval ci;
......
......@@ -28,8 +28,12 @@ class PackinfoGenTest(unittest.TestCase):
for file_name_to_test in ('PI1.cpp', 'PI2.cpp'):
file_to_test = ctx.files[file_name_to_test]
# For packing kernels it is better not to have OpenMP in the code because the packing kernels
# themselves are launched inside an OpenMP parallel region. Still, it could be forced by setting
# openmp to True in generate_pack_info_from_kernel
if openmp:
assert '#pragma omp parallel' in file_to_test
assert '#pragma omp parallel' not in file_to_test
if da:
assert 'float ' not in file_to_test
......
......@@ -82,5 +82,3 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo
} // namespace lbm
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file CombinedInPlaceCpuPackInfo.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once
#define IS_EVEN(x) (((x) & 1) ^ 1)
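// IS_EVEN(0) == 1 and IS_EVEN(1) == 0, so even timestep counters select the even pack info below.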
#include "communication/UniformPackInfo.h"
#include "lbm/inplace_streaming/TimestepTracker.h"
namespace walberla
{
namespace lbm
{
template< typename EvenPackInfo, typename OddPackInfo >
class CombinedInPlaceCpuPackInfo : public ::walberla::communication::UniformPackInfo
{
public:
template< typename... Args >
CombinedInPlaceCpuPackInfo(std::shared_ptr< lbm::TimestepTracker >& tracker, Args&&... args)
: tracker_(tracker), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...)
{}
~CombinedInPlaceCpuPackInfo() override = default;
bool constantDataExchange() const override { return true; }
bool threadsafeReceiving() const override { return true; }
void unpackData(IBlock* receiver, stencil::Direction dir, mpi::RecvBuffer& buffer) override
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.unpackData(receiver, dir, buffer);
}
else
{
return oddPackInfo_.unpackData(receiver, dir, buffer);
}
}
void communicateLocal(const IBlock* sender, IBlock* receiver, stencil::Direction dir) override
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.communicateLocal(sender, receiver, dir);
}
else
{
return oddPackInfo_.communicateLocal(sender, receiver, dir);
}
}
void packDataImpl(const IBlock* sender, stencil::Direction dir, mpi::SendBuffer& outBuffer) const override
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.packDataImpl(sender, dir, outBuffer);
}
else
{
return oddPackInfo_.packDataImpl(sender, dir, outBuffer);
}
}
void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const
{
if (IS_EVEN(tracker_->getCounter()))
{
evenPackInfo_.pack(dir, buffer, block);
}
else
{
oddPackInfo_.pack(dir, buffer, block);
}
}
void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const
{
if (IS_EVEN(tracker_->getCounter()))
{
evenPackInfo_.unpack(dir, buffer, block);
}
else
{
oddPackInfo_.unpack(dir, buffer, block);
}
}
uint_t size(stencil::Direction dir, IBlock* block) const
{
if (IS_EVEN(tracker_->getCounter()))
{
return evenPackInfo_.size(dir, block);
}
else
{
return oddPackInfo_.size(dir, block);
}
}
private:
const std::shared_ptr< lbm::TimestepTracker >& tracker_;
EvenPackInfo evenPackInfo_;
OddPackInfo oddPackInfo_;
};
} // namespace lbm
} // namespace walberla