Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • 121-buffersystem-receiver-info-without-sender-ranks
  • 3-stable
  • 4-stable
  • AddaptTypeSystem
  • CMakeCodeGenPartTwo
  • ChannelFlow
  • CoVortex
  • CodegenForRefinement
  • CommunicationGPUBenchmark
  • ComnbinedGPUPackinfo
  • ExportCudaDeviceSelection
  • FixSinglePrecisionProblems
  • FlagFieldExample
  • FlowAroundSphere
  • FreeSurface
  • GPURefineTest
  • GPURefinement
  • GPURefinementImprovement
  • HRR
  • HydroPressure
  • IBC
  • InterpolationBC
  • Italy
  • LDC
  • Lagoon
  • LeesEdwards
  • ListLBM
  • NewChannelBenchmark
  • Remove_fSize_from_templates
  • SphereMovie
  • TGA
  • TaylorBubble
  • TurbulentChannel
  • UpgradePystencils
  • VTKUnstructured
  • clang11
  • develop
  • develop2
  • fluidizedbed_showcase
  • master
  • phaseField
  • phasefield-drop
  • porous
  • porousHeat
  • remiPorous
  • s2a
  • setup_walberla_codegen
  • vbondmodel_integrated
  • vbondmodel_isotropic
  • v3.1
  • v3.2
  • v3.3
  • v4.0dev
  • v4.1
  • v4.2
  • v5.0dev
56 results

Target

Select target project
  • castellsc/walberla
  • le45zyci/walberla
  • el38efib/walberla
  • sudesh.rathnayake/walberla
  • hoenig/walberla
  • Bindgen/walberla
  • rahil.doshi/walberla
  • em73etav/walberla
  • walberla/walberla
  • ArashPartow/walberla
  • jarmatz/walberla
  • ec93ujoh/walberla
  • jbadwaik/walberla
  • ravi.k.ayyala/walberla
  • ProjectPhysX/walberla
  • ob28imeq/walberla
  • shellshocked2003/walberla
  • stewart/walberla
  • behzad.safaei/walberla
  • schruff/walberla
  • loreson/walberla
  • Novermars/walberla
  • itischler/walberla
  • holzer/walberla
  • da15siwa/walberla
  • he66coqe/walberla
  • jngrad/walberla
  • uq60ifih/walberla
  • ostanin/walberla
  • bauer/walberla
  • zy79zopo/walberla
  • jonas_schmitt/walberla
  • po60nani/walberla
  • ro36vugi/walberla
  • fweik/walberla
  • ab04unyc/walberla
  • yw25ynew/walberla
  • ig38otak/walberla
  • RudolfWeeber/walberla
39 results
Select Git revision
  • 121-buffersystem-receiver-info-without-sender-ranks
  • 128-some-tests-are-not-active
  • 146-cuda-gcc-config-warning
  • 3-stable
  • 4-stable
  • 5-stable
  • 6-stable
  • 7-stable
  • 727-refactor-sqlExport
  • AtomicAdd_for_CUDA_compute_capabilities<6.0
  • ChargedParticles
  • CodegenForRefinement
  • GeneratedOutflowBC
  • RayleighBernardConvection
  • Remove_fSize_from_templates
  • UpdateGPUBenchmark
  • UpdatePhaseField
  • angersbach/coding-day-01-09
  • antidunes-visualization
  • bam_piping_erosion
  • benchmark_sqlite_modify
  • change-default-layout-fzyx
  • clang-tidy
  • clang11
  • clang_tidy2
  • cmake_cleanup
  • cnt_app
  • codegen-update
  • coding-day-01-09-mesh
  • coupling_tutorial
  • doshi/coding-day-01-09
  • externalize_dependencies
  • fix_nvcc_compiler_warnings
  • fluidizedbed_showcase
  • hip-ShiftedPeriodicity
  • kajol/coding-day
  • kemmler/particle_coupling_GPU
  • lbmpy-kernel-comparison
  • master
  • plewinski/fix-Guo-force-model-TRT-MRT
  • pystencils2.0-adoption
  • rangersbach/doxygen_style
  • ravi/coding-day
  • ravi/material_transport
  • setup_walberla_codegen
  • suction_bucket
  • suffa/NorthWind
  • suffa/NorthWind_refined
  • suffa/SYCL
  • suffa/Sparse
  • suffa/compact_interpolation
  • suffa/fix_2D_force_on_boundary
  • suffa/integrate_moving_geo
  • suffa/offline_fetch_content
  • suffa/psm_lbm_package
  • thermalFreeSurfaceLBM
  • thoennes/cusotm-mpi-reduce-function
  • use-correct-codegen-data-type
  • viscLDCwithFSLBM
  • v3.1
  • v3.2
  • v3.3
  • v4.0dev
  • v4.1
  • v4.2
  • v5.0dev
  • v5.1
  • v6.0dev
  • v6.1
  • v7.0dev
  • v7.1
71 results
Show changes
Showing
with 3024 additions and 298 deletions
......@@ -15,10 +15,10 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
DEPENDS blockforest core gpu field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
else ()
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
DEPENDS blockforest core field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
......@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
import sys
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
"""Checks if a single block of given size fits into GPU memory"""
......@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s
class Scenario:
def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
def __init__(self, time_step_strategy,
cuda_block_size,
cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = 0
......@@ -89,6 +96,14 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
......@@ -101,43 +116,19 @@ class Scenario:
def benchmark():
scenarios = wlb.ScenarioManager()
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_size = (256, 256, 256)
block_size = (320, 320, 320)
cuda_enabled_mpi = True
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
else:
scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
scenarios.add(Scenario(time_step_strategy='normal',
cuda_block_size=(128, 1, 1),
cells_per_block=block_size,
cuda_enabled_mpi=cuda_enabled_mpi))
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1),
(32, 4, 1), (64, 4, 1),
(32, 4, 2),
(32, 8, 1),
(16, 16, 1)]
for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
for cuda_block in cuda_blocks:
for block_size in block_sizes:
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
continue
scenario = Scenario(time_step_strategy=time_step_strategy,
cuda_block_size=cuda_block,
cells_per_block=block_size)
scenarios.add(scenario)
# benchmark()
kernel_benchmark()
benchmark()
......@@ -29,6 +29,7 @@
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
......@@ -78,14 +79,10 @@ int main(int argc, char** argv)
logging::configureLogging(config);
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
......@@ -102,6 +99,7 @@ int main(int argc, char** argv)
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
BlockDataID phase_field_gpu =
gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
#else
BlockDataID lb_phase_field =
field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
......@@ -109,6 +107,7 @@ int main(int argc, char** argv)
field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
#endif
if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
......@@ -139,47 +138,80 @@ int main(int argc, char** argv)
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
gpuBlockSize[1], gpuBlockSize[2]);
#else
pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif
// add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
auto Comm_velocity_based_distributions =
make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_velocity_based_distributions =
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
const int streamLowPriority = 0;
const int streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
auto Comm_phase_field_distributions =
make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_phase_field_distributions =
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#else
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
auto generatedPackInfo_velocity_based_distributions =
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
auto generatedPackInfo_phase_field_distributions =
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< PhaseField_T >(phase_field);
auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#endif
BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
......@@ -201,99 +233,37 @@ int main(int argc, char** argv)
init_h(&block);
init_g(&block);
}
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
}
SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
int const streamLowPriority = 0;
int const streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
#endif
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions->startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block, defaultStream);
Comm_velocity_based_distributions->wait();
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
Comm_phase_field_distributions->startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block, defaultStream);
Comm_phase_field_distributions->wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#else
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions.startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block);
Comm_velocity_based_distributions.wait();
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block);
Comm_phase_field_distributions.wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#endif
std::function< void() > timeStep;
if (timeStepStrategy == "phase_only")
{
timeStep = std::function< void() >(phase_only);
WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
}
else if (timeStepStrategy == "hydro_only")
{
timeStep = std::function< void() >(hydro_only);
WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
}
else if (timeStepStrategy == "kernel_only")
{
timeStep = std::function< void() >(without_comm);
WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
}
else
{
timeStep = std::function< void() >(normalTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
}
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
// remaining time logger
if (remainingTimeLoggerFrequency > 0)
timeLoop->addFuncAfterTimeStep(
timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
"remaining time logger");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
#endif
uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 1)
......@@ -307,40 +277,60 @@ int main(int argc, char** argv)
auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
vtkOutput->addCellDataWriter(phaseWriter);
timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop->singleStep();
timeloop.singleStep();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
timeLoop->setCurrentTimeStepToZero();
timeloop.setCurrentTimeStepToZero();
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
WcTimingPool timeloopTiming;
WcTimer simTimer;
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
#endif
simTimer.start();
timeLoop->run();
timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
auto time = real_c(simTimer.last());
auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
#if defined(WALBERLA_BUILD_WITH_CUDA)
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
#endif
// Call Python function to report results
pythonCallbackResults();
......
......@@ -19,7 +19,7 @@ with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
stencil_phase = LBStencil(Stencil.D3Q15)
stencil_hydro = LBStencil(Stencil.D3Q27)
stencil_hydro = LBStencil(Stencil.D3Q19)
assert (stencil_phase.D == stencil_hydro.D)
########################
......@@ -76,13 +76,13 @@ with CodeGeneration() as ctx:
delta_equilibrium=False,
force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
weighted=True, relaxation_rates=rates,
output={'density': C_tmp}, kernel_type='stream_pull_collide')
output={'density': C_tmp})
method_phase = create_lb_method(lbm_config=lbm_config_phase)
lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
weighted=True, relaxation_rate=omega,
force=sp.symbols(f"F_:{stencil_hydro.D}"),
output={'velocity': u}, kernel_type='collide_stream_push')
output={'velocity': u})
method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
# create the kernels for the initialization of the g and h field
......@@ -137,7 +137,8 @@ with CodeGeneration() as ctx:
sweep_params = {'block_size': sweep_block_size}
stencil_typedefs = {'Stencil_phase_T': stencil_phase,
'Stencil_hydro_T': stencil_hydro}
'Stencil_hydro_T': stencil_hydro,
'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
field_typedefs = {'PdfField_phase_T': h,
'PdfField_hydro_T': g,
'VelocityField_T': u,
......@@ -156,7 +157,7 @@ with CodeGeneration() as ctx:
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp), (C, C_tmp)],
field_swaps=[(h, h_tmp)],
inner_outer_split=True,
cpu_vectorize_info=cpu_vec,
target=Target.CPU)
......@@ -172,7 +173,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.CPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target=Target.CPU)
streaming_pattern='pull', target=Target.CPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
......@@ -183,7 +184,7 @@ with CodeGeneration() as ctx:
g_updates, target=Target.GPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp), (C, C_tmp)],
field_swaps=[(h, h_tmp)],
target=Target.GPU,
gpu_indexing_params=sweep_params,
varying_parameters=vp)
......@@ -198,7 +199,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.GPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target=Target.GPU)
streaming_pattern='pull', target=Target.GPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)
......
......@@ -37,6 +37,7 @@
#include "core/debug/CheckFunctions.h"
#include "core/debug/Debug.h"
#include "core/debug/TestSubsystem.h"
#include "core/logging/Initialization.h"
#include "core/logging/Logging.h"
#include "core/math/Constants.h"
#include "core/math/Sample.h"
......@@ -2889,10 +2890,9 @@ int main( int argc, char **argv )
" ('fzyx') data layout!" << std::endl;
}
return EXIT_SUCCESS;
}
}
logging::Logging::printHeaderOnStream();
//WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::PROGRESS ); }
#ifdef _OPENMP
if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )
......@@ -2903,6 +2903,7 @@ int main( int argc, char **argv )
// open configuration file
shared_ptr< Config > config = make_shared< Config >();
logging::configureLogging(config);
config->readParameterFile( argv[1] );
Config::BlockHandle configBlock = config->getBlock( "SchaeferTurek" );
......
......@@ -29,31 +29,31 @@ SchaeferTurek
//////////////////////////////
memoryPerCell 153; // in bytes
processMemoryLimit 2048; // in MiB !
processMemoryLimit 4048; // in MiB !
yzBlocks 1; // blocks in y- and z-direction [2D: only 1 block in z-direction]
yzBlocks 4; // blocks in y- and z-direction [2D: only 1 block in z-direction]
// The number of blocks in x-direction is calculated from: H, L, yzBlocks, xCells, and yzCells.
xCells 40; // number of cells in x-direction for each block
yzCells 40; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
xCells 64; // number of cells in x-direction for each block
yzCells 16; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
H 0.41; // [m]
L 20.0; // [m] (2.5 [3D], 2.2 [2D])
L 2.5; // [m] (2.5 [3D], 2.2 [2D])
strictlyObeyL false;//true; // true = outflow will be located exactly at "L", false = outflow might be slightly further away than "L"
cylinderxPosition 0.5; // [m] (0.5 [3D], 0.2 [2D])
cylinderyPosition 0.2; // [m]
cylinderRadius 0.18; // [m]
cylinderRadius 0.05; // [m]
circularCrossSection true; // true = obstacle is a true cylinder, false = obstacle is a box
kinViscosity 0.001; // [m^2/s]
rho 1; // [kg/m^3]
inflowVelocity 2; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
raisingTime 0.3; // [s] (0 == immediately full velocity)
inflowVelocity 0.45; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
raisingTime 0; // [s] (0 == immediately full velocity)
sinPeriod 0; // [s] (0 == no sine term)
obstacleBoundary 0; // 0 = staircase, 1 = curved
obstacleBoundary 1; // 0 = staircase, 1 = curved
outletType 1; // 0 = pressure, 1 = outlet (2/1), 2 = outlet (4/3)
initWithVelocity false; // at the beginning of the simulation everything is set to: ( 4 * inflowVelocity ) / 9 [3D] or ( 2 * inflowVelocity ) / 3 [2D]
......@@ -63,7 +63,7 @@ SchaeferTurek
nbrOfEvaluationPointsForCoefficientExtremas 100;
evaluatePressure true;
evaluatePressure false;
pAlpha < 0.45, 0.2, 0.205 >; // points for evaluating
pOmega < 0.55, 0.2, 0.205 >; // the pressure difference
......@@ -74,9 +74,9 @@ SchaeferTurek
// WHERE TO REFINE ? //
///////////////////////
useCylinderForRefinement false;
cylinderRefinementLevel 2; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
cylinderRefinementBuffer 0.05; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
useCylinderForRefinement true;
cylinderRefinementLevel 0; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
cylinderRefinementBuffer 0.0; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
AABBRefinementSelection
{
......@@ -149,11 +149,11 @@ SchaeferTurek
////////////////////////////
//minSimulationTime 10; // in [s] - ATTENTION: _IF_ 'minSimulationTime' is specified, 'outerTimeSteps' is adapted accordingly!
outerTimeSteps 2; // total number of time steps = outerTimeSteps * innerTimeSteps
innerTimeSteps 5000; // For each outer loop, performance data is logged.
outerTimeSteps 1; // total number of time steps = outerTimeSteps * innerTimeSteps
innerTimeSteps 20001; // For each outer loop, performance data is logged.
evaluationCheckFrequency 0; // 0 = disable evaluation
evaluationLogToStream false;
evaluationCheckFrequency 100; // 0 = disable evaluation
evaluationLogToStream true;
evaluationLogToFile true;
evaluationFilename SchaeferTurek.txt;
......@@ -175,7 +175,7 @@ SchaeferTurek
vtkMPIIO true;
}
remainingTimeLoggerFrequency 5; // in seconds
remainingTimeLoggerFrequency 10; // in seconds
vtkBeforeTimeStep true; // false = at the end of time step
......@@ -243,18 +243,20 @@ SchaeferTurek
checkStrouhalNbrDiscreteDUpperBound 1E6; // upper bound for the Strouhal number (only checked if "evaluateStrouhal" is set to true - "discrete" cylinder diameter used for evaluation)
}
Logging
{
logLevel info; // info progress detail tracing
}
VTK
{
/*
fluid_field
{
baseFolder vtk;
//initialWriteCallsToSkip 55130;
writeFrequency 304;//102;
ghostLayers 1;
ghostLayers 0;
//AABB_filter_0 {
// min < 0, 0, 0.2049999 >;
......@@ -276,18 +278,16 @@ VTK
}
writers {
//VelocityFromPDF;
VelocityFromPDF;
VelocityMagnitudeFromPDF;
//DensityFromPDF;
DensityFromPDF;
//NonEquPart;
//PDF;
}
incompatibleBlockStates empty;
}
*/
/*
flag_field
{
baseFolder vtk;
......@@ -302,8 +302,7 @@ VTK
incompatibleBlockStates empty;
}
*/
/*
domain_decomposition
{
......
......@@ -3,14 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
......
......@@ -235,12 +235,16 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("vectorised", vectorised);
pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
......
......@@ -3,17 +3,12 @@ from dataclasses import replace
import sympy as sp
import pystencils as ps
from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\
insert_symbol_times_minus_one
from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
from lbmpy.enums import Method, Stencil
from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
from lbmpy.enums import Method, Stencil, SubgridScaleModel
from lbmpy.moments import get_default_moment_set_for_stencil
from lbmpy.updatekernels import create_stream_only_kernel
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
......@@ -21,48 +16,53 @@ from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
# best configs in terms of FLOPS
options_dict = {
'srt': {
'method': Method.SRT,
'relaxation_rate': omega,
'compressible': True,
'compressible': False,
},
'trt': {
'method': Method.TRT,
'relaxation_rate': omega,
'compressible': True,
'compressible': False,
},
'r-w-mrt': {
'mrt': {
'method': Method.MRT,
'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'compressible': True,
'compressible': False,
},
'w-mrt': {
'mrt-overrelax': {
'method': Method.MRT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
'compressible': False,
},
'r-cm': {
'central': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rate': omega,
'compressible': True,
},
'cm': {
'central-overrelax': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
},
'r-k': {
'method': Method.CUMULANT,
'cumulant': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rate': omega,
'compressible': True,
},
'k': {
'method': Method.CUMULANT,
'cumulant-overrelax': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'cumulant-K17': {
'method': Method.CUMULANT,
'relaxation_rate': omega,
'compressible': True,
'fourth_order_correction': 0.01
},
'entropic': {
'method': Method.TRT_KBC_N4,
'compressible': True,
......@@ -73,7 +73,12 @@ options_dict = {
},
'smagorinsky': {
'method': Method.SRT,
'smagorinsky': False,
'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
'relaxation_rate': omega,
},
'qr': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.QR,
'relaxation_rate': omega,
}
}
......@@ -83,6 +88,8 @@ info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool vectorised = {vec};
const bool nontemporal = {nt_stores};
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
......@@ -90,10 +97,15 @@ const bool infoCsePdfs = {cse_pdfs};
with CodeGeneration() as ctx:
openmp = True if ctx.openmp else False
field_type = "float64" if ctx.double_accuracy else "float32"
if ctx.optimize_for_localhost:
cpu_vec = {"nontemporal": False, "assume_aligned": True}
else:
cpu_vec = None
# This base pointer specification introduces temporary pointers in the outer loop such that the inner loop
# only contains aligned memory addresses. This makes NT stores much more effective, which yields large performance
# gains, especially for the pull scheme on Skylake architectures.
base_pointer_spec = None # [['spatialInner0'], ['spatialInner1']]
# cpu_vec = {"instruction_set": "best", "nontemporal": False,
# "assume_aligned": True, 'assume_sufficient_line_padding': True}
cpu_vec = {"instruction_set": None}
nt_stores = False
config_tokens = ctx.config.split('_')
......@@ -110,7 +122,6 @@ with CodeGeneration() as ctx:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup]
assert stencil.D == 3, "This application supports only three-dimensional stencils"
......@@ -121,23 +132,20 @@ with CodeGeneration() as ctx:
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
# This creates a simplified version of the central moment collision operator where the bulk and shear viscosity are
# not separated. This is done to get a fair comparison with the monomial cumulants.
if lbm_config.method == Method.CENTRAL_MOMENT:
lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
field_swaps = [(pdfs, pdfs_tmp)]
else:
field_swaps = []
# Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
# is_inplace is set to False to ensure that the streaming is done with src and dst field.
# If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
# accessor = CollideOnlyInplaceAccessor()
accessor = get_accessor(streaming_pattern, Timestep.EVEN)
#accessor.is_inplace = False
field_swaps_stream_only = () if accessor.is_inplace else [(pdfs, pdfs_tmp)]
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, None if accessor.is_inplace else pdfs_tmp, accessor=accessor)
# This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
# the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
# typically remove the copy of the center PDF which results in an overestimation of the maximum performance
stream_only_kernel = []
for i in range(stencil.Q):
stream_only_kernel.append(ps.Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
# LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
......@@ -152,17 +160,26 @@ with CodeGeneration() as ctx:
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec)
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec,
base_pointer_specification=base_pointer_spec)
# Stream only kernel
cpu_vec_stream = None
if ctx.optimize_for_localhost:
cpu_vec_stream = {"instruction_set": "best", "nontemporal": True,
"assume_aligned": True, 'assume_sufficient_line_padding': True,
"assume_inner_stride_one": True}
generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
field_swaps=field_swaps_stream_only,
target=ps.Target.CPU, cpu_openmp=openmp)
target=ps.Target.CPU, cpu_openmp=openmp,
cpu_vectorize_info=cpu_vec_stream, base_pointer_specification=[['spatialInner0'], ['spatialInner1']])
infoHeaderParams = {
'stencil': stencil_str,
'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup,
'vec': int(True if cpu_vec else False),
'nt_stores': int(nt_stores),
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
......
......@@ -4,13 +4,26 @@ from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 per process
# If twice as many cells are on the process, half as many time steps are run, etc.
# increase this to get more reliable measurements
TIME_STEPS_FOR_128_BLOCK = 10
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
......@@ -35,7 +48,7 @@ class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1):
vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
if boundary_setup:
init_shear_flow = False
......@@ -58,6 +71,7 @@ class Scenario:
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
......@@ -101,6 +115,15 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
......@@ -111,8 +134,8 @@ class Scenario:
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
storeSingle(result, table_name, DB_FILE)
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
......@@ -156,18 +179,38 @@ def overlap_benchmark():
scenarios.add(scenario)
def scaling_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running scaling benchmark")
def weak_scaling_benchmark():
wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells_per_block = [(32, 32, 32), (128, 128, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(time_step_strategy=t,
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
cells_per_block=(WeakX, WeakY, WeakZ),
boundary_setup=True,
outer_iterations=1,
db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
def strong_scaling_benchmark():
    """Register the strong-scaling scenarios for a fixed global domain.

    The global domain ``(StrongX, StrongY, StrongZ)`` is divided by the block
    decomposition of the current number of MPI processes to obtain the cells
    per block. One scenario is added for each of the time step strategies
    "noOverlap" and "simpleOverlap", each with boundary setup enabled, one
    outer iteration, 10 time steps and
    ``db_file_name="strongScalingUniformGridOneBlock.sqlite3"``.
    """
    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
    wlb.log_info_on_root("")

    scenario_manager = wlb.ScenarioManager()

    global_cells = (StrongX, StrongY, StrongZ)
    process_grid = block_decomposition(wlb.mpi.numProcesses())
    # NOTE(review): the domain extents are paired with the decomposition in
    # reversed order (first extent with the last decomposition entry) - confirm
    # this matches the axis order returned by block_decomposition.
    block_cells = tuple(extent // count
                        for extent, count in zip(global_cells, reversed(process_grid)))

    for strategy in ("noOverlap", "simpleOverlap"):
        scenario = Scenario(cells_per_block=block_cells,
                            time_step_strategy=strategy,
                            outer_iterations=1,
                            timesteps=10,
                            boundary_setup=True,
                            db_file_name="strongScalingUniformGridOneBlock.sqlite3")
        scenario_manager.add(scenario)
def single_node_benchmark():
......@@ -176,13 +219,11 @@ def single_node_benchmark():
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
for block_size in block_sizes:
scenario = Scenario(cells_per_block=block_size,
time_step_strategy='kernelOnly',
outer_iterations=1,
timesteps=num_time_steps(block_size))
scenarios.add(scenario)
scenario = Scenario(cells_per_block=(128, 128, 128),
time_step_strategy='kernelOnly',
outer_iterations=1,
timesteps=10)
scenarios.add(scenario)
def validation_run():
......@@ -211,5 +252,15 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
# performance of compute kernel (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
validation_run()
# validation_run()
# scaling_benchmark()
# Pick the benchmark from the BENCHMARK setting:
# 0 = single node, 1 = weak scaling, 2 = strong scaling, anything else = validation run.
_benchmark_by_id = {
    0: single_node_benchmark,
    1: weak_scaling_benchmark,
    2: strong_scaling_benchmark,
}
_benchmark_by_id.get(BENCHMARK, validation_run)()
......@@ -3,13 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py
......
......@@ -136,7 +136,7 @@ int main(int argc, char** argv)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// LB SWEEPS AND BOUNDARY HANDLING ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID);
// Boundaries
const FlagUID fluidFlagUID("Fluid");
......@@ -264,6 +264,13 @@ int main(int argc, char** argv)
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
......
......@@ -4,17 +4,16 @@ import pystencils as ps
from dataclasses import replace
from pystencils import Assignment
from pystencils.typing import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, SubgridScaleModel
from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.moments import get_default_moment_set_for_stencil
from lbmpy.updatekernels import create_stream_only_kernel
from lbmpy.fieldaccess import *
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
......@@ -74,6 +73,12 @@ options_dict = {
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'cumulant-K17': {
'method': Method.CUMULANT,
'relaxation_rate': omega,
'compressible': True,
'fourth_order_correction': 0.01
},
'entropic': {
'method': Method.TRT_KBC_N4,
'compressible': True,
......@@ -84,7 +89,12 @@ options_dict = {
},
'smagorinsky': {
'method': Method.SRT,
'smagorinsky': False,
'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
'relaxation_rate': omega,
},
'qr': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.QR,
'relaxation_rate': omega,
}
}
......@@ -101,7 +111,8 @@ const bool infoCsePdfs = {cse_pdfs};
optimize = True
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
pdf_data_type = "float64"
field_data_type = "float64"
config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3
......@@ -124,8 +135,8 @@ with CodeGeneration() as ctx:
options = options_dict[collision_setup]
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_data_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_data_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
......@@ -140,13 +151,12 @@ with CodeGeneration() as ctx:
else:
field_swaps = []
# Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
# is_inplace is set to False to ensure that the streaming is done with src and dst field.
# If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
accessor = CollideOnlyInplaceAccessor()
accessor.is_inplace = False
field_swaps_stream_only = [(pdfs, pdfs_tmp)]
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
# This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
# the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
# typically remove the copy of the center PDF which results in an overestimation of the maximum performance
stream_only_kernel = []
for i in range(stencil.Q):
stream_only_kernel.append(Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
# LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
......@@ -158,9 +168,10 @@ with CodeGeneration() as ctx:
lb_method = collision_rule.method
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
boundary_object=NoSlip(), field_data_type=pdf_data_type)
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([0.05, 0, 0], data_type=field_type))
boundary_object=UBB([0.05, 0, 0], data_type=field_data_type),
field_data_type=pdf_data_type)
generate_lbm_package(ctx, name="UniformGridGPU",
collision_rule=collision_rule,
......@@ -168,12 +179,12 @@ with CodeGeneration() as ctx:
nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
data_type=field_data_type, pdfs_data_type=pdf_data_type,
max_threads=max_threads)
# Stream only kernel
vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')]
generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU,
generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel,
gpu_indexing_params={'block_size': (128, 1, 1)}, target=ps.Target.GPU,
max_threads=max_threads)
infoHeaderParams = {
......
......@@ -6,11 +6,25 @@ import sys
import sqlite3
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 per GPU
# If twice as many cells are on the GPU, half as many time steps are run, etc.
# increase this to get more reliable measurements
TIME_STEPS_FOR_128_BLOCK = 1000
DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
BASE_CONFIG = {
'DomainSetup': {
......@@ -39,6 +53,8 @@ ldc_setup = {'Border': [
def num_time_steps(block_size, time_steps_for_128_block=200):
    """Return the number of time steps to run for a block of the given size.

    The count scales inversely with the number of cells in the block, relative
    to a reference block of 128^3 cells, and is never smaller than 10.

    :param block_size: sequence with the number of cells per direction
    :param time_steps_for_128_block: time steps to run for a 128^3 block
    :return: number of time steps as ``int`` (at least 10)
    """
    # prod() covers every entry of block_size instead of hard-coding three axes
    cells = prod(block_size)
    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
    # enforce a lower bound so that very large blocks still run a few steps
    return int(max(time_steps, 10))
......@@ -61,13 +77,13 @@ class Scenario:
inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1,
additional_info=None):
additional_info=None, blocks=None, db_file_name=None):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
......@@ -85,6 +101,7 @@ class Scenario:
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
self.additional_info = additional_info
......@@ -97,7 +114,6 @@ class Scenario:
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
'oneBlockPerProcess': True
},
'Parameters': {
'omega': self.omega,
......@@ -115,7 +131,6 @@ class Scenario:
'Logging': {
'logLevel': 'info', # info progress detail tracing
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
......@@ -140,6 +155,15 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
......@@ -150,8 +174,8 @@ class Scenario:
table_name = table_name.replace("-", "_") # - not allowed for table name would lead to syntax error
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
storeSingle(result, table_name, DB_FILE)
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
......@@ -200,12 +224,70 @@ def overlap_benchmark():
scenarios.add(scenario)
def no_overlap_scaling(cuda_enabled_mpi=False):
    """Register a single scaling scenario that uses the 'noOverlap' strategy.

    The scenario uses 256^3 cells per block, a CUDA block size of (128, 1, 1)
    and one outer iteration.

    :param cuda_enabled_mpi: forwarded unchanged to the ``Scenario``
    """
    wlb.log_info_on_root("Running scaling benchmark without communication hiding")
    wlb.log_info_on_root("")

    scenario_manager = wlb.ScenarioManager()

    # communication is not overlapped with computation in this scenario
    scenario = Scenario(cells_per_block=(256, 256, 256),
                        cuda_blocks=(128, 1, 1),
                        time_step_strategy='noOverlap',
                        inner_outer_split=(1, 1, 1),
                        cuda_enabled_mpi=cuda_enabled_mpi,
                        outer_iterations=1)
    scenario_manager.add(scenario)
def weak_scaling_overlap(cuda_enabled_mpi=False):
    """Register the weak-scaling scenarios with and without communication overlap.

    One scenario is added for each of the time step strategies "noOverlap" and
    "simpleOverlap". Every scenario uses ``(WeakX, WeakY, WeakZ)`` cells per
    block, a CUDA block size of (128, 1, 1), an inner/outer split of
    (64, 64, 64), boundary setup enabled, one outer iteration and
    ``db_file_name="weakScalingUniformGrid.sqlite3"``.

    :param cuda_enabled_mpi: forwarded unchanged to every ``Scenario``
    """
    wlb.log_info_on_root("Running scaling benchmark with communication hiding")
    wlb.log_info_on_root("")

    scenario_manager = wlb.ScenarioManager()

    for strategy in ("noOverlap", "simpleOverlap"):
        scenario = Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
                            cuda_blocks=(128, 1, 1),
                            time_step_strategy=strategy,
                            inner_outer_split=(64, 64, 64),
                            cuda_enabled_mpi=cuda_enabled_mpi,
                            outer_iterations=1,
                            boundary_setup=True,
                            db_file_name="weakScalingUniformGrid.sqlite3")
        scenario_manager.add(scenario)
def strong_scaling_overlap(cuda_enabled_mpi=False):
    """Register the strong-scaling scenarios for a fixed global domain.

    The global domain ``(StrongX, StrongY, StrongZ)`` is divided by the block
    decomposition of the current number of MPI processes to obtain the cells
    per block. One scenario is added for each of the time step strategies
    "noOverlap" and "simpleOverlap", each with a CUDA block size of
    (128, 1, 1), boundary setup enabled, one outer iteration, 50 time steps and
    ``db_file_name="strongScalingUniformGridOneBlock.sqlite3"``.

    :param cuda_enabled_mpi: forwarded unchanged to every ``Scenario``
    """
    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc with communication hiding")
    wlb.log_info_on_root("")

    scenario_manager = wlb.ScenarioManager()

    global_cells = (StrongX, StrongY, StrongZ)
    process_grid = block_decomposition(wlb.mpi.numProcesses())
    # NOTE(review): the domain extents are paired with the decomposition in
    # reversed order (first extent with the last decomposition entry) - confirm
    # this matches the axis order returned by block_decomposition.
    block_cells = tuple(extent // count
                        for extent, count in zip(global_cells, reversed(process_grid)))

    for strategy in ("noOverlap", "simpleOverlap"):
        scenario = Scenario(cells_per_block=block_cells,
                            cuda_blocks=(128, 1, 1),
                            time_step_strategy=strategy,
                            inner_outer_split=(1, 1, 1),
                            cuda_enabled_mpi=cuda_enabled_mpi,
                            outer_iterations=1,
                            timesteps=50,
                            blocks=process_grid,
                            boundary_setup=True,
                            db_file_name="strongScalingUniformGridOneBlock.sqlite3")
        scenario_manager.add(scenario)
def single_gpu_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single GPU benchmarks")
wlb.log_info_on_root("")
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
gpu_mem = gpu_mem_gb * (2 ** 30)
gpu_type = os.environ.get('GPU_TYPE')
......@@ -214,12 +296,8 @@ def single_gpu_benchmark():
additional_info['gpu_type'] = gpu_type
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (32, 64, 128, 256)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
(32, 8, 1), (64, 8, 1),
(32, 16, 1)]
block_sizes = [(i, i, i) for i in (128, 256, 320)]
cuda_blocks = [(128, 1, 1), ]
for block_size in block_sizes:
for cuda_block_size in cuda_blocks:
# cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
......@@ -266,4 +344,14 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
# performance of compute kernel (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
validation_run()
# validation_run()
if BENCHMARK == 0:
single_gpu_benchmark()
elif BENCHMARK == 1:
weak_scaling_overlap(True)
elif BENCHMARK == 2:
strong_scaling_overlap(True)
else:
validation_run()
......@@ -17,7 +17,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
set ( pythonModules "-Wl,-whole-archive" ${PYTHON_MODULE_DEPENDENCIES} "-Wl,-no-whole-archive" )
endif()
add_library( walberla_cpp SHARED PythonModule.cpp )
add_library( walberla_cpp SHARED PythonModule.cpp
../showcases/FlowAroundSphere/FlowAroundSphere.cpp)
target_link_libraries( walberla_cpp ${WALBERLA_LINK_LIBRARIES_KEYWORD} ${pythonModules} ${SERVICE_LIBS} )
......
......@@ -10,6 +10,11 @@ add_subdirectory( PegIntoSphereBed )
if ( WALBERLA_BUILD_WITH_CODEGEN)
add_subdirectory( Antidunes )
add_subdirectory( FlowAroundSphere )
add_subdirectory( FlowAroundSphereCPU )
add_subdirectory( FlowAroundCylinder )
add_subdirectory( Channel )
add_subdirectory( TaylorGreenVortex )
if (WALBERLA_BUILD_WITH_PYTHON)
add_subdirectory( PhaseFieldAllenCahn )
......
# Build setup for the Channel showcase.

# Hand the parameter files (*.prm) and Python scripts (*.py) of this
# directory to the waLBerla helper that links files to the build directory.
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
# Code-generation target driven by Channel.py. OUT_FILES lists the files the
# script is expected to produce: NoSlip/UBB match the boundary class names
# and ChannelHeader matches the info header requested in Channel.py.
waLBerla_generate_target_from_python(NAME ChannelGenerated
FILE Channel.py
OUT_FILES ChannelStorageSpecification.h ChannelStorageSpecification.cpp
ChannelSweepCollection.h ChannelSweepCollection.cpp
NoSlip.h NoSlip.cpp
UBB.h UBB.cpp
ChannelBoundaryCollection.h
ChannelHeader.h)
# The Channel executable is built from Channel.cpp and depends on the
# generated target above plus the listed waLBerla modules.
waLBerla_add_executable ( NAME Channel
FILES Channel.cpp
DEPENDS ChannelGenerated blockforest core field geometry lbm_generated timeloop )
This diff is collapsed.
// Parameter file for the Channel showcase.
// General simulation parameters.
// NOTE(review): remainingTimeLoggerFrequency and vtkWriteFrequency are also
// set in the Logging and VTKWriter blocks below, with different values -
// confirm which block the application actually reads.
Parameters
{
omega 1.4;
timesteps 10;
maxLatticeVelocity 0.01;
remainingTimeLoggerFrequency 0; // in seconds
vtkWriteFrequency 0;
}
// Block decomposition of the domain: 2 x 2 x 1 blocks of 4 x 4 x 4 cells.
DomainSetup
{
blocks < 2, 2, 1 >;
cellsPerBlock < 4, 4, 4 >;
periodic < 1, 0, 1 >;
refinementLevels 1;
numberProcesses 1; // This is for load balancing, overwritten if more than one proc is used
}
// Boundary flags on the S and N borders. The flag names NoSlip and UBB match
// the flag_uid values used for the generated boundaries in Channel.py.
Boundaries
{
Border { direction S; walldistance -1; flag NoSlip; }
Border { direction N; walldistance -1; flag UBB; }
}
// VTK output settings (see the note on vtkWriteFrequency at the top).
VTKWriter
{
vtkWriteFrequency 1;
velocity true;
density true;
averageFields true;
flag false;
writeOnlySlice false;
amrFileFormat false;
oneFilePerProcess false;
}
// Logging settings (see the note on remainingTimeLoggerFrequency at the top).
Logging
{
logLevel info; // info progress detail tracing
writeSetupForestAndReturn false;
remainingTimeLoggerFrequency 60; // in seconds
}
// Evaluation output settings; results are named Channel.txt.
// NOTE(review): evaluationCheckFrequency (100) is larger than timesteps (10)
// in the Parameters block - confirm this is intended for this setup.
Evaluation
{
evaluationCheckFrequency 100;
logToStream true;
logToFile true;
filename Channel.txt;
}
import sympy as sp
from pystencils import Target
from pystencils import fields
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
import warnings
warnings.filterwarnings("ignore")
# Code generation script for the Channel showcase: sets up a D3Q27 TRT
# lattice Boltzmann method with NoSlip and UBB boundaries and emits the
# "Channel" LBM package plus the 'ChannelHeader' info header.
with CodeGeneration() as ctx:
    # NOTE(review): `target` is assigned but not used in this script.
    target = Target.CPU  # Target.GPU if ctx.cuda else Target.CPU

    # Precision of the macroscopic fields follows the build configuration;
    # the PDF fields are always declared as float64.
    if ctx.double_accuracy:
        data_type = "float64"
    else:
        data_type = "float32"
    pdf_dtype = "float64"

    streaming_pattern = 'pull'
    # NOTE(review): `timesteps` is not used below.
    timesteps = get_timesteps(streaming_pattern)

    omega = sp.symbols("omega")

    stencil = LBStencil(Stencil.D3Q27)
    dim = stencil.D

    # Symbolic fields. NOTE(review): `pdfs` and `pdfs_tmp` are not used below.
    pdfs, pdfs_tmp = fields(
        f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_dtype}[3D]",
        layout='fzyx',
    )
    velocity_field, density_field = fields(
        f"velocity({dim}), density(1) : {data_type}[{dim}D]",
        layout='fzyx',
    )
    macroscopic_fields = dict(density=density_field, velocity=velocity_field)

    # Method configuration: compressible TRT with a symbolic relaxation rate.
    lbm_config = LBMConfig(
        stencil=stencil,
        method=Method.TRT,
        relaxation_rate=omega,
        streaming_pattern=streaming_pattern,
        compressible=True,
    )
    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')

    # NOTE(review): `method` is not used below.
    method = create_lb_method(lbm_config=lbm_config)
    collision_rule = create_lb_collision_rule(
        lbm_config=lbm_config,
        lbm_optimisation=lbm_opt,
    )

    # Boundary generators; the velocity boundary takes a symbolic x-velocity.
    no_slip = lbm_boundary_generator(
        class_name='NoSlip',
        flag_uid='NoSlip',
        boundary_object=NoSlip(),
    )
    ubb = lbm_boundary_generator(
        class_name='UBB',
        flag_uid='UBB',
        boundary_object=UBB([sp.Symbol("u_x"), 0, 0], data_type=data_type),
    )

    generate_lbm_package(
        ctx,
        name="Channel",
        collision_rule=collision_rule,
        lbm_config=lbm_config,
        lbm_optimisation=lbm_opt,
        nonuniform=True,
        boundaries=[no_slip, ubb],
        macroscopic_fields=macroscopic_fields,
        data_type=data_type,
    )

    generate_info_header(ctx, 'ChannelHeader')