Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing with 3024 additions and 298 deletions
@@ -15,10 +15,10 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
    waLBerla_add_executable(NAME benchmark_multiphase
                            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-                           DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+                           DEPENDS blockforest core gpu field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
else ()
    waLBerla_add_executable(NAME benchmark_multiphase
                            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-                           DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+                           DEPENDS blockforest core field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
import sys
from math import prod
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None

def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
    """Checks if a single block of given size fits into GPU memory"""
@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s
class Scenario:
-    def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
+    def __init__(self, time_step_strategy,
+                 cuda_block_size,
+                 cells_per_block=(256, 256, 256),
                  cuda_enabled_mpi=False):
        # output frequencies
        self.vtkWriteFrequency = 0
@@ -89,6 +96,14 @@ class Scenario:
            data['compile_flags'] = wlb.build_info.compiler_flags
            data['walberla_version'] = wlb.build_info.version
            data['build_machine'] = wlb.build_info.build_machine
+           if ms:
+               state = ms.MachineState(extended=False, anonymous=True)
+               state.generate()  # generate subclasses
+               state.update()    # read information
+               data["MachineState"] = str(state.get())
+           else:
+               print("MachineState module is not available. MachineState was not saved")
            sequenceValuesToScalars(data)
            df = pd.DataFrame.from_records([data])
@@ -101,43 +116,19 @@ class Scenario:
def benchmark():
    scenarios = wlb.ScenarioManager()
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
    gpu_mem = gpu_mem_gb * (2 ** 30)
-    block_size = (256, 256, 256)
+    block_size = (320, 320, 320)
+    cuda_enabled_mpi = True
    if not domain_block_size_ok(block_size, gpu_mem):
        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
    else:
-        scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
+        scenarios.add(Scenario(time_step_strategy='normal',
+                               cuda_block_size=(128, 1, 1),
+                               cells_per_block=block_size,
+                               cuda_enabled_mpi=cuda_enabled_mpi))
-def kernel_benchmark():
-    scenarios = wlb.ScenarioManager()
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
-    gpu_mem = gpu_mem_gb * (2 ** 30)
-    block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
-                   (32, 4, 1), (64, 4, 1),
-                   (32, 4, 2),
-                   (32, 8, 1),
-                   (16, 16, 1)]
-    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
-        for cuda_block in cuda_blocks:
-            for block_size in block_sizes:
-                if not domain_block_size_ok(block_size, gpu_mem):
-                    wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
-                    continue
-                scenario = Scenario(time_step_strategy=time_step_strategy,
-                                    cuda_block_size=cuda_block,
-                                    cells_per_block=block_size)
-                scenarios.add(scenario)
-# benchmark()
-kernel_benchmark()
+benchmark()
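For orientation, with the new defaults (40 GB of GPU memory, one 320^3 block) the rough estimate sketched above leaves plenty of headroom; the per-cell count of 47 values is the same assumption as before and the numbers are purely illustrative:

cells = (320 + 2) ** 3              # 33,386,248 cells including one ghost layer per side
bytes_needed = cells * 47 * 8       # roughly 12.6 GB with 8-byte values
assert bytes_needed < 40 * 2 ** 30  # comfortably below the assumed 40 GB of GPU memory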
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "field/vtk/VTKWriter.h" #include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h" #include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h" #include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h" #include "python_coupling/DictWrapper.h"
...@@ -78,14 +79,10 @@ int main(int argc, char** argv) ...@@ -78,14 +79,10 @@ int main(int argc, char** argv)
logging::configureLogging(config); logging::configureLogging(config);
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config); shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters // Reading parameters
auto parameters = config->getOneBlock("Parameters"); auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1)); const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2)); const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
...@@ -102,6 +99,7 @@ int main(int argc, char** argv) ...@@ -102,6 +99,7 @@ int main(int argc, char** argv)
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
BlockDataID phase_field_gpu = BlockDataID phase_field_gpu =
gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
#else #else
BlockDataID lb_phase_field = BlockDataID lb_phase_field =
field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx); field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
...@@ -109,6 +107,7 @@ int main(int argc, char** argv) ...@@ -109,6 +107,7 @@ int main(int argc, char** argv)
field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx); field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
#endif #endif
if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only") if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
@@ -139,47 +138,80 @@ int main(int argc, char** argv)
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
pystencils::phase_field_LB_step phase_field_LB_step(
-   lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+   lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
                                        gpuBlockSize[1], gpuBlockSize[2]);
#else
pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
-pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
+pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif

// add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
-const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
-auto Comm_velocity_based_distributions =
-   make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
-auto generatedPackInfo_velocity_based_distributions =
-   make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
-Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
-auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
-Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
-auto Comm_phase_field_distributions =
-   make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
-auto generatedPackInfo_phase_field_distributions =
-   make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
-Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
+const int streamLowPriority = 0;
+const int streamHighPriority = 0;
+auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
+auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
+auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
+auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
+auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
+auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
+auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
+UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
+auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+{
+   auto phaseField = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
+   auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
+   phaseField->swapDataPointers(phaseFieldTMP);
+});
#else
-blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
-auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
-auto generatedPackInfo_velocity_based_distributions =
-   make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
-Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
-Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
-blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
-auto generatedPackInfo_phase_field_distributions =
-   make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
-Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
+auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
+auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
+auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
+auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
+UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
+auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
+auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+{
+   auto phaseField = b->getData< PhaseField_T >(phase_field);
+   auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
+   phaseField->swapDataPointers(phaseFieldTMP);
+});
#endif

BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
...@@ -201,99 +233,37 @@ int main(int argc, char** argv) ...@@ -201,99 +233,37 @@ int main(int argc, char** argv)
init_h(&block); init_h(&block);
init_g(&block); init_g(&block);
} }
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done") WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
} }
+SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
+timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+               << Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
+               << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
+timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+               << Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
+timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+               << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
+timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
-int const streamLowPriority = 0;
-int const streamHighPriority = 0;
-auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
-auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
-#endif
-auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
-#if defined(WALBERLA_BUILD_WITH_CUDA)
-auto normalTimeStep = [&]() {
-   Comm_velocity_based_distributions->startCommunication();
-   for (auto& block : *blocks)
-      phase_field_LB_step(&block, defaultStream);
Comm_velocity_based_distributions->wait();
Comm_phase_field_distributions->startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block, defaultStream);
Comm_phase_field_distributions->wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#else
+timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+               << Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
+               << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
-auto normalTimeStep = [&]() {
-   Comm_velocity_based_distributions.startCommunication();
-   for (auto& block : *blocks)
phase_field_LB_step(&block);
Comm_velocity_based_distributions.wait();
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block);
Comm_phase_field_distributions.wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#endif
std::function< void() > timeStep;
if (timeStepStrategy == "phase_only")
{
timeStep = std::function< void() >(phase_only);
WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
}
else if (timeStepStrategy == "hydro_only")
{
timeStep = std::function< void() >(hydro_only);
WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
}
else if (timeStepStrategy == "kernel_only")
{
timeStep = std::function< void() >(without_comm);
WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
}
else
{
timeStep = std::function< void() >(normalTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
}
-timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
-// remaining time logger
-if (remainingTimeLoggerFrequency > 0)
-   timeLoop->addFuncAfterTimeStep(
-      timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
-      "remaining time logger");
+timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+               << Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
+timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+               << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
+timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
#endif
uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 1) if (vtkWriteFrequency > 1)
...@@ -307,40 +277,60 @@ int main(int argc, char** argv) ...@@ -307,40 +277,60 @@ int main(int argc, char** argv)
auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase"); auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
vtkOutput->addCellDataWriter(phaseWriter); vtkOutput->addCellDataWriter(phaseWriter);
-timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
} }
lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
for (uint_t i = 0; i < warmupSteps; ++i) for (uint_t i = 0; i < warmupSteps; ++i)
-timeLoop->singleStep();
+timeloop.singleStep();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
-timeLoop->setCurrentTimeStepToZero();
+timeloop.setCurrentTimeStepToZero();
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
WcTimingPool timeloopTiming;
WcTimer simTimer; WcTimer simTimer;
#if defined(WALBERLA_BUILD_WITH_CUDA) #if defined(WALBERLA_BUILD_WITH_CUDA)
-cudaDeviceSynchronize();
+WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
#endif #endif
simTimer.start(); simTimer.start();
-timeLoop->run();
+timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_CUDA) #if defined(WALBERLA_BUILD_WITH_CUDA)
-cudaDeviceSynchronize();
+WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif #endif
WALBERLA_MPI_BARRIER()
simTimer.end(); simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-auto time = real_c(simTimer.last());
-auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
-auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
-WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
+double time = simTimer.max();
+WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+performance.logResultOnRoot(timesteps, time);
+const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s") WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
WALBERLA_ROOT_SECTION() WALBERLA_ROOT_SECTION()
{ {
python_coupling::PythonCallback pythonCallbackResults("results_callback"); python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable()) if (pythonCallbackResults.isCallable())
{ {
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase); pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro); pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
#if defined(WALBERLA_BUILD_WITH_CUDA) #if defined(WALBERLA_BUILD_WITH_CUDA)
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi); pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
#endif #endif
// Call Python function to report results // Call Python function to report results
pythonCallbackResults(); pythonCallbackResults();
......
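The MLUPS figure reported here is the quantity the removed lines computed explicitly: cells per process times time steps, divided by runtime and 10^6. As an illustration with made-up numbers, a single 320^3 block advanced 50 time steps in 25 s gives:

mlups_per_process = 320 ** 3 * 50 / 25 / 1e6   # = 65.536, i.e. about 65.5 MLUPS per process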
@@ -19,7 +19,7 @@ with CodeGeneration() as ctx:
    field_type = "float64" if ctx.double_accuracy else "float32"
    stencil_phase = LBStencil(Stencil.D3Q15)
-   stencil_hydro = LBStencil(Stencil.D3Q27)
+   stencil_hydro = LBStencil(Stencil.D3Q19)
    assert (stencil_phase.D == stencil_hydro.D)
######################## ########################
@@ -76,13 +76,13 @@ with CodeGeneration() as ctx:
                                 delta_equilibrium=False,
                                 force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
                                 weighted=True, relaxation_rates=rates,
-                                output={'density': C_tmp}, kernel_type='stream_pull_collide')
+                                output={'density': C_tmp})
    method_phase = create_lb_method(lbm_config=lbm_config_phase)

    lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
                                 weighted=True, relaxation_rate=omega,
                                 force=sp.symbols(f"F_:{stencil_hydro.D}"),
-                                output={'velocity': u}, kernel_type='collide_stream_push')
+                                output={'velocity': u})
    method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
# create the kernels for the initialization of the g and h field # create the kernels for the initialization of the g and h field
@@ -137,7 +137,8 @@ with CodeGeneration() as ctx:
    sweep_params = {'block_size': sweep_block_size}

    stencil_typedefs = {'Stencil_phase_T': stencil_phase,
-                       'Stencil_hydro_T': stencil_hydro}
+                       'Stencil_hydro_T': stencil_hydro,
+                       'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
field_typedefs = {'PdfField_phase_T': h, field_typedefs = {'PdfField_phase_T': h,
'PdfField_hydro_T': g, 'PdfField_hydro_T': g,
'VelocityField_T': u, 'VelocityField_T': u,
...@@ -156,7 +157,7 @@ with CodeGeneration() as ctx: ...@@ -156,7 +157,7 @@ with CodeGeneration() as ctx:
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU) generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step, generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                   field_swaps=[(h, h_tmp), (C, C_tmp)],
+                   field_swaps=[(h, h_tmp)],
inner_outer_split=True, inner_outer_split=True,
cpu_vectorize_info=cpu_vec, cpu_vectorize_info=cpu_vec,
target=Target.CPU) target=Target.CPU)
...@@ -172,7 +173,7 @@ with CodeGeneration() as ctx: ...@@ -172,7 +173,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.CPU) streaming_pattern='pull', target=Target.CPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g, generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
-                          streaming_pattern='push', target=Target.CPU)
+                          streaming_pattern='pull', target=Target.CPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU) generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
...@@ -183,7 +184,7 @@ with CodeGeneration() as ctx: ...@@ -183,7 +184,7 @@ with CodeGeneration() as ctx:
g_updates, target=Target.GPU) g_updates, target=Target.GPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step, generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                   field_swaps=[(h, h_tmp), (C, C_tmp)],
+                   field_swaps=[(h, h_tmp)],
target=Target.GPU, target=Target.GPU,
gpu_indexing_params=sweep_params, gpu_indexing_params=sweep_params,
varying_parameters=vp) varying_parameters=vp)
...@@ -198,7 +199,7 @@ with CodeGeneration() as ctx: ...@@ -198,7 +199,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.GPU) streaming_pattern='pull', target=Target.GPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g, generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
-                          streaming_pattern='push', target=Target.GPU)
+                          streaming_pattern='pull', target=Target.GPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU) generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include "core/debug/CheckFunctions.h" #include "core/debug/CheckFunctions.h"
#include "core/debug/Debug.h" #include "core/debug/Debug.h"
#include "core/debug/TestSubsystem.h" #include "core/debug/TestSubsystem.h"
#include "core/logging/Initialization.h"
#include "core/logging/Logging.h" #include "core/logging/Logging.h"
#include "core/math/Constants.h" #include "core/math/Constants.h"
#include "core/math/Sample.h" #include "core/math/Sample.h"
...@@ -2889,10 +2890,9 @@ int main( int argc, char **argv ) ...@@ -2889,10 +2890,9 @@ int main( int argc, char **argv )
" ('fzyx') data layout!" << std::endl; " ('fzyx') data layout!" << std::endl;
} }
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
logging::Logging::printHeaderOnStream(); logging::Logging::printHeaderOnStream();
//WALBERLA_ROOT_SECTION() { logging::Logging::instance()->setLogLevel( logging::Logging::PROGRESS ); }
#ifdef _OPENMP #ifdef _OPENMP
if( std::getenv( "OMP_NUM_THREADS" ) == nullptr ) if( std::getenv( "OMP_NUM_THREADS" ) == nullptr )
...@@ -2903,6 +2903,7 @@ int main( int argc, char **argv ) ...@@ -2903,6 +2903,7 @@ int main( int argc, char **argv )
// open configuration file // open configuration file
shared_ptr< Config > config = make_shared< Config >(); shared_ptr< Config > config = make_shared< Config >();
logging::configureLogging(config);
config->readParameterFile( argv[1] ); config->readParameterFile( argv[1] );
Config::BlockHandle configBlock = config->getBlock( "SchaeferTurek" ); Config::BlockHandle configBlock = config->getBlock( "SchaeferTurek" );
......
@@ -29,31 +29,31 @@ SchaeferTurek
//////////////////////////////
memoryPerCell 153; // in bytes
-processMemoryLimit 2048; // in MiB !
+processMemoryLimit 4048; // in MiB !
-yzBlocks 1; // blocks in y- and z-direction [2D: only 1 block in z-direction]
+yzBlocks 4; // blocks in y- and z-direction [2D: only 1 block in z-direction]
// The number of blocks in x-direction is calculated from: H, L, yzBlocks, xCells, and yzCells.
-xCells 40; // number of cells in x-direction for each block
+xCells 64; // number of cells in x-direction for each block
-yzCells 40; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
+yzCells 16; // number of cells in y- and z-direction for each block [2D: only affects y-direction]
H 0.41; // [m]
-L 20.0; // [m] (2.5 [3D], 2.2 [2D])
+L 2.5; // [m] (2.5 [3D], 2.2 [2D])
strictlyObeyL false;//true; // true = outflow will be located exactly at "L", false = outflow might be slightly further away than "L"
cylinderxPosition 0.5; // [m] (0.5 [3D], 0.2 [2D])
cylinderyPosition 0.2; // [m]
-cylinderRadius 0.18; // [m]
+cylinderRadius 0.05; // [m]
circularCrossSection true; // true = obstacle is a true cylinder, false = obstacle is a box
kinViscosity 0.001; // [m^2/s]
rho 1; // [kg/m^3]
-inflowVelocity 2; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
+inflowVelocity 0.45; // [m/s] (0.45 [3D], 2.25 [3D], 0.3 [2D], 1.5 [2D])
-raisingTime 0.3; // [s] (0 == immediately full velocity)
+raisingTime 0; // [s] (0 == immediately full velocity)
sinPeriod 0; // [s] (0 == no sinus term)
-obstacleBoundary 0; // 0 = staircase, 1 = curved
+obstacleBoundary 1; // 0 = staircase, 1 = curved
outletType 1; // 0 = pressure, 1 = outlet (2/1), 2 = outlet (4/3)
initWithVelocity false; // at the beginning of the simulation everything is set to: ( 4 * inflowVelocity ) / 9 [3D] or ( 2 * inflowVelocity ) / 3 [2D]
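One plausible reading of the "number of blocks in x-direction is calculated from H, L, yzBlocks, xCells, and yzCells" comment, worked out with the new values and assuming cubic cells (this derivation is an illustration, not taken from the solver):

dx = 0.41 / (4 * 16)       # H / (yzBlocks * yzCells) = 0.00640625 m per cell
cells_x = 2.5 / dx         # L / dx, roughly 390 cells along the channel
x_blocks = -(-390 // 64)   # ceil(cells_x / xCells) = 7 blocks; strictlyObeyL is false, so the domain may overshoot L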
...@@ -63,7 +63,7 @@ SchaeferTurek ...@@ -63,7 +63,7 @@ SchaeferTurek
nbrOfEvaluationPointsForCoefficientExtremas 100; nbrOfEvaluationPointsForCoefficientExtremas 100;
-evaluatePressure true;
+evaluatePressure false;
pAlpha < 0.45, 0.2, 0.205 >; // points for evaluating pAlpha < 0.45, 0.2, 0.205 >; // points for evaluating
pOmega < 0.55, 0.2, 0.205 >; // the pressure difference pOmega < 0.55, 0.2, 0.205 >; // the pressure difference
...@@ -74,9 +74,9 @@ SchaeferTurek ...@@ -74,9 +74,9 @@ SchaeferTurek
// WHERE TO REFINE ? // // WHERE TO REFINE ? //
/////////////////////// ///////////////////////
-useCylinderForRefinement false;
+useCylinderForRefinement true;
-cylinderRefinementLevel 2; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
+cylinderRefinementLevel 0; // if "useCylinderForRefinement" is true, everything around the cylinder is refined to this level
-cylinderRefinementBuffer 0.05; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
+cylinderRefinementBuffer 0.0; // [m] - additional space around the cylinder that is also guaranteed to be at level "cylinderRefinementLevel"
AABBRefinementSelection AABBRefinementSelection
{ {
...@@ -149,11 +149,11 @@ SchaeferTurek ...@@ -149,11 +149,11 @@ SchaeferTurek
//////////////////////////// ////////////////////////////
//minSimulationTime 10; // in [s] - ATTENTION: _IF_ 'minSimulationTime' is specified, 'outerTimeSteps' is adapted accordingly! //minSimulationTime 10; // in [s] - ATTENTION: _IF_ 'minSimulationTime' is specified, 'outerTimeSteps' is adapted accordingly!
-outerTimeSteps 2; // total number of time steps = outerTimeSteps * innerTimeSteps
+outerTimeSteps 1; // total number of time steps = outerTimeSteps * innerTimeSteps
-innerTimeSteps 5000; // For each outer loop, performance data is logged.
+innerTimeSteps 20001; // For each outer loop, performance data is logged.
-evaluationCheckFrequency 0; // 0 = disable evaluation
+evaluationCheckFrequency 100; // 0 = disable evaluation
-evaluationLogToStream false;
+evaluationLogToStream true;
evaluationLogToFile true; evaluationLogToFile true;
evaluationFilename SchaeferTurek.txt; evaluationFilename SchaeferTurek.txt;
...@@ -175,7 +175,7 @@ SchaeferTurek ...@@ -175,7 +175,7 @@ SchaeferTurek
vtkMPIIO true; vtkMPIIO true;
} }
-remainingTimeLoggerFrequency 5; // in seconds
+remainingTimeLoggerFrequency 10; // in seconds
vtkBeforeTimeStep true; // false = at the end of time step vtkBeforeTimeStep true; // false = at the end of time step
...@@ -243,18 +243,20 @@ SchaeferTurek ...@@ -243,18 +243,20 @@ SchaeferTurek
checkStrouhalNbrDiscreteDUpperBound 1E6; // upper bound for the Strouhal number (only checked if "evaluateStrouhal" is set to true - "discrete" cylinder diameter used for evaluation)
}
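For reference, the Strouhal number checked by these bounds is the standard dimensionless vortex-shedding frequency, with f the shedding frequency, D the cylinder diameter (here either the physical or the "discrete" one) and U the mean inflow velocity:

St = f * D / U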
Logging
{
logLevel info; // info progress detail tracing
}
VTK VTK
{ {
/*
fluid_field fluid_field
{ {
baseFolder vtk; baseFolder vtk;
//initialWriteCallsToSkip 55130; //initialWriteCallsToSkip 55130;
writeFrequency 304;//102;
-ghostLayers 1;
+ghostLayers 0;
//AABB_filter_0 { //AABB_filter_0 {
// min < 0, 0, 0.2049999 >; // min < 0, 0, 0.2049999 >;
...@@ -276,18 +278,16 @@ VTK ...@@ -276,18 +278,16 @@ VTK
} }
writers { writers {
-//VelocityFromPDF;
+VelocityFromPDF;
VelocityMagnitudeFromPDF;
-//DensityFromPDF;
+DensityFromPDF;
//NonEquPart; //NonEquPart;
//PDF; //PDF;
} }
incompatibleBlockStates empty; incompatibleBlockStates empty;
} }
*/
/*
flag_field flag_field
{ {
baseFolder vtk; baseFolder vtk;
...@@ -302,8 +302,7 @@ VTK ...@@ -302,8 +302,7 @@ VTK
incompatibleBlockStates empty; incompatibleBlockStates empty;
} }
*/
/* /*
domain_decomposition domain_decomposition
{ {
......
@@ -3,14 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )

-foreach(streaming_pattern pull push aa esotwist)
+foreach(streaming_pattern pull push aa esotwist esopull esopush)
    foreach(stencil d3q19 d3q27)
-       foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
+       foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
            # KBC methods only for D2Q9 and D3Q27 defined
            if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
                continue()
            endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+           if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
+               continue()
+           endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
                                                 FILE UniformGridCPU.py
......
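Each (streaming pattern, stencil, collision setup) combination in the loops above becomes its own code-generation target, named after the set(config ...) line; for example:

config = d3q27_pull_cumulant-K17  ->  target UniformGridCPUGenerated_d3q27_pull_cumulant-K17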
...@@ -235,12 +235,16 @@ int main(int argc, char** argv) ...@@ -235,12 +235,16 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes()); pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads()); pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores()); pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time)); pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time)); pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil); pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("vectorised", vectorised);
pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal); pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs); pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results // Call Python function to report results
......
@@ -3,17 +3,12 @@ from dataclasses import replace
import sympy as sp
import pystencils as ps
-from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\
-    insert_symbol_times_minus_one
from lbmpy.advanced_streaming import is_inplace
-from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep
+from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
-from lbmpy.enums import Method, Stencil
+from lbmpy.enums import Method, Stencil, SubgridScaleModel
-from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
from lbmpy.moments import get_default_moment_set_for_stencil
-from lbmpy.updatekernels import create_stream_only_kernel
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
@@ -21,48 +16,53 @@ from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')

-# best configs in terms of FLOPS
options_dict = {
    'srt': {
        'method': Method.SRT,
        'relaxation_rate': omega,
-       'compressible': True,
+       'compressible': False,
    },
    'trt': {
        'method': Method.TRT,
        'relaxation_rate': omega,
-       'compressible': True,
+       'compressible': False,
    },
-   'r-w-mrt': {
+   'mrt': {
        'method': Method.MRT,
        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
-       'compressible': True,
+       'compressible': False,
    },
-   'w-mrt': {
+   'mrt-overrelax': {
        'method': Method.MRT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
-       'compressible': True,
+       'compressible': False,
    },
-   'r-cm': {
+   'central': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rate': omega,
        'compressible': True,
    },
-   'cm': {
+   'central-overrelax': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': True,
    },
-   'r-k': {
+   'cumulant': {
-       'method': Method.CUMULANT,
+       'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
    },
-   'k': {
+   'cumulant-overrelax': {
-       'method': Method.CUMULANT,
+       'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
+   'cumulant-K17': {
+       'method': Method.CUMULANT,
+       'relaxation_rate': omega,
+       'compressible': True,
+       'fourth_order_correction': 0.01
+   },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
@@ -73,7 +73,12 @@ options_dict = {
    },
    'smagorinsky': {
        'method': Method.SRT,
-       'smagorinsky': False,
+       'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
+       'relaxation_rate': omega,
+   },
+   'qr': {
+       'method': Method.SRT,
+       'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
}
...@@ -83,6 +88,8 @@ info_header = """ ...@@ -83,6 +88,8 @@ info_header = """
const char * infoStencil = "{stencil}"; const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}"; const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}"; const char * infoCollisionSetup = "{collision_setup}";
const bool vectorised = {vec};
const bool nontemporal = {nt_stores};
const bool infoCseGlobal = {cse_global}; const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs}; const bool infoCsePdfs = {cse_pdfs};
""" """
@@ -90,10 +97,15 @@ const bool infoCsePdfs = {cse_pdfs};
with CodeGeneration() as ctx:
    openmp = True if ctx.openmp else False
    field_type = "float64" if ctx.double_accuracy else "float32"
-   if ctx.optimize_for_localhost:
-       cpu_vec = {"nontemporal": False, "assume_aligned": True}
-   else:
-       cpu_vec = None
+   # This base pointer specification introduces temporary pointers in the outer loop so that the inner loop
+   # only contains aligned memory addresses. This makes NT stores much more effective, which gives large
+   # performance gains, especially for the pull scheme on Skylake architectures.
+   base_pointer_spec = None  # [['spatialInner0'], ['spatialInner1']]
+   # cpu_vec = {"instruction_set": "best", "nontemporal": False,
+   #            "assume_aligned": True, 'assume_sufficient_line_padding': True}
+   cpu_vec = {"instruction_set": None}
+   nt_stores = False

    config_tokens = ctx.config.split('_')
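If one wanted to switch the vectorised / NT-store path described in the comment back on for a local build, the knobs would be set along these lines (illustrative values mirroring the commented-out lines above; this is not what this revision ships):

base_pointer_spec = [['spatialInner0'], ['spatialInner1']]
cpu_vec = {"instruction_set": "best", "nontemporal": True,
           "assume_aligned": True, 'assume_sufficient_line_padding': True}
nt_stores = True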
...@@ -110,7 +122,6 @@ with CodeGeneration() as ctx: ...@@ -110,7 +122,6 @@ with CodeGeneration() as ctx:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment") raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}" assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup] options = options_dict[collision_setup]
assert stencil.D == 3, "This application supports only three-dimensional stencils" assert stencil.D == 3, "This application supports only three-dimensional stencils"
...@@ -121,23 +132,20 @@ with CodeGeneration() as ctx: ...@@ -121,23 +132,20 @@ with CodeGeneration() as ctx:
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
# This creates a simplified version of the central moment collision operator where the bulk and shear viscosity is
# not seperated. This is done to get a fair comparison with the monomial cumulants.
if lbm_config.method == Method.CENTRAL_MOMENT: if lbm_config.method == Method.CENTRAL_MOMENT:
lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil)) lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
if not is_inplace(streaming_pattern): if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp) lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
field_swaps = [(pdfs, pdfs_tmp)]
else:
field_swaps = []
-   # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
-   # is_inplace is set to False to ensure that the streaming is done with src and dst field.
-   # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
-   # accessor = CollideOnlyInplaceAccessor()
-   accessor = get_accessor(streaming_pattern, Timestep.EVEN)
-   # accessor.is_inplace = False
-   field_swaps_stream_only = () if accessor.is_inplace else [(pdfs, pdfs_tmp)]
-   stream_only_kernel = create_stream_only_kernel(stencil, pdfs, None if accessor.is_inplace else pdfs_tmp, accessor=accessor)
+   # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations by
+   # the compiler, the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns, compilers would
+   # typically remove the copy of the centre PDF, which results in an overestimation of the maximum performance.
+   stream_only_kernel = []
+   for i in range(stencil.Q):
+       stream_only_kernel.append(ps.Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
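To make the shuffle concrete, for a D3Q19 stencil the loop above emits one assignment per PDF index, each reading from the index three positions further on (modulo Q), so even the centre PDF is not a plain copy:

# destination i reads from source (i + 3) % 19, i.e.
# (0, 3), (1, 4), (2, 5), ..., (16, 0), (17, 1), (18, 2)
pairs = [(i, (i + 3) % 19) for i in range(19)]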
# LB Sweep # LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
...@@ -152,17 +160,26 @@ with CodeGeneration() as ctx: ...@@ -152,17 +160,26 @@ with CodeGeneration() as ctx:
lbm_config=lbm_config, lbm_optimisation=lbm_opt, lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=False, boundaries=[no_slip, ubb], nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields, macroscopic_fields=macroscopic_fields,
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) cpu_openmp=openmp, cpu_vectorize_info=cpu_vec,
base_pointer_specification=base_pointer_spec)
# Stream only kernel # Stream only kernel
cpu_vec_stream = None
if ctx.optimize_for_localhost:
cpu_vec_stream = {"instruction_set": "best", "nontemporal": True,
"assume_aligned": True, 'assume_sufficient_line_padding': True,
"assume_inner_stride_one": True}
    generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
-                  field_swaps=field_swaps_stream_only,
-                  target=ps.Target.CPU, cpu_openmp=openmp)
+                  target=ps.Target.CPU, cpu_openmp=openmp,
+                  cpu_vectorize_info=cpu_vec_stream, base_pointer_specification=[['spatialInner0'], ['spatialInner1']])
infoHeaderParams = { infoHeaderParams = {
'stencil': stencil_str, 'stencil': stencil_str,
'streaming_pattern': streaming_pattern, 'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup, 'collision_setup': collision_setup,
'vec': int(True if cpu_vec else False),
'nt_stores': int(nt_stores),
'cse_global': int(lbm_opt.cse_global), 'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs), 'cse_pdfs': int(lbm_opt.cse_pdfs),
} }
......
...@@ -4,13 +4,26 @@ from waLBerla.tools.config import block_decomposition ...@@ -4,13 +4,26 @@ from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys import sys
import sqlite3 import sqlite3
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 per process
# if double as many cells are on the process, half as many time steps are run etc.
# increase this to get more reliable measurements
TIME_STEPS_FOR_128_BLOCK = 10
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
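The body of num_time_steps is collapsed here; following the comment above (128^3 cells per process is the reference workload and the step count scales inversely with the workload), a sketch could look like this — an assumption, not the file's exact code:

def num_time_steps_sketch(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
    # twice the cells per process -> half the time steps, and so on; run at least one step
    cells = prod(block_size)
    return max(1, int(time_steps_for_128_block * 128 ** 3 / cells))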
...@@ -35,7 +48,7 @@ class Scenario: ...@@ -35,7 +48,7 @@ class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1, def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1), timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False, warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1): vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
if boundary_setup: if boundary_setup:
init_shear_flow = False init_shear_flow = False
...@@ -58,6 +71,7 @@ class Scenario: ...@@ -58,6 +71,7 @@ class Scenario:
self.vtk_write_frequency = vtk_write_frequency self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False) self.config_dict = self.config(print_dict=False)
...@@ -101,6 +115,15 @@ class Scenario: ...@@ -101,6 +115,15 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data) sequenceValuesToScalars(data)
result = data result = data
...@@ -111,8 +134,8 @@ class Scenario: ...@@ -111,8 +134,8 @@ class Scenario:
table_name = table_name.replace("-", "_") table_name = table_name.replace("-", "_")
for num_try in range(num_tries): for num_try in range(num_tries):
try: try:
checkAndUpdateSchema(result, table_name, DB_FILE) checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, DB_FILE) storeSingle(result, table_name, self.db_file_name)
break break
except sqlite3.OperationalError as e: except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}") wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
...@@ -156,18 +179,38 @@ def overlap_benchmark(): ...@@ -156,18 +179,38 @@ def overlap_benchmark():
scenarios.add(scenario) scenarios.add(scenario)
def scaling_benchmark(): def weak_scaling_benchmark():
"""Tests different communication overlapping strategies""" wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
wlb.log_info_on_root("Running scaling benchmark")
wlb.log_info_on_root("") wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager() scenarios = wlb.ScenarioManager()
cells_per_block = [(32, 32, 32), (128, 128, 128)]
for cell_per_block in cells_per_block: for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(time_step_strategy='noOverlap', scenarios.add(Scenario(time_step_strategy=t,
inner_outer_split=(1, 1, 1), inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block)) cells_per_block=(WeakX, WeakY, WeakZ),
boundary_setup=True,
outer_iterations=1,
db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
def strong_scaling_benchmark():
wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
domain_size = (StrongX, StrongY, StrongZ)
blocks = block_decomposition(wlb.mpi.numProcesses())
cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
time_step_strategy=t,
outer_iterations=1,
timesteps=10,
boundary_setup=True,
db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
def single_node_benchmark(): def single_node_benchmark():
...@@ -176,13 +219,11 @@ def single_node_benchmark(): ...@@ -176,13 +219,11 @@ def single_node_benchmark():
wlb.log_info_on_root("") wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager() scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)] scenario = Scenario(cells_per_block=(128, 128, 128),
for block_size in block_sizes: time_step_strategy='kernelOnly',
scenario = Scenario(cells_per_block=block_size, outer_iterations=1,
time_step_strategy='kernelOnly', timesteps=10)
outer_iterations=1, scenarios.add(scenario)
timesteps=num_time_steps(block_size))
scenarios.add(scenario)
def validation_run(): def validation_run():
...@@ -211,5 +252,15 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI ...@@ -211,5 +252,15 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
# performance of compute kernel (no communication) # performance of compute kernel (no communication)
# overlap_benchmark() # benchmarks different communication overlap options # overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only # profiling() # run only two timesteps on a smaller domain for profiling only
validation_run() # validation_run()
# scaling_benchmark() # scaling_benchmark()
if BENCHMARK == 0:
single_node_benchmark()
elif BENCHMARK == 1:
weak_scaling_benchmark()
elif BENCHMARK == 2:
strong_scaling_benchmark()
else:
validation_run()
...@@ -3,13 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" ) ...@@ -3,13 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" ) waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist) foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27) foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky) foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods only for D2Q9 and D3Q27 defined # KBC methods only for D2Q9 and D3Q27 defined
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue() continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup}) set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py FILE UniformGridGPU.py
......
...@@ -136,7 +136,7 @@ int main(int argc, char** argv) ...@@ -136,7 +136,7 @@ int main(int argc, char** argv)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// LB SWEEPS AND BOUNDARY HANDLING /// /// LB SWEEPS AND BOUNDARY HANDLING ///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID);
// Boundaries // Boundaries
const FlagUID fluidFlagUID("Fluid"); const FlagUID fluidFlagUID("Fluid");
...@@ -264,6 +264,13 @@ int main(int argc, char** argv) ...@@ -264,6 +264,13 @@ int main(int argc, char** argv)
python_coupling::PythonCallback pythonCallbackResults("results_callback"); python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable()) if (pythonCallbackResults.isCallable())
{ {
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil); pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
......
...@@ -4,17 +4,16 @@ import pystencils as ps ...@@ -4,17 +4,16 @@ import pystencils as ps
from dataclasses import replace from dataclasses import replace
from pystencils import Assignment
from pystencils.typing import TypedSymbol from pystencils.typing import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, SubgridScaleModel
from lbmpy.advanced_streaming import is_inplace from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.moments import get_default_moment_set_for_stencil
from lbmpy.updatekernels import create_stream_only_kernel
from lbmpy.fieldaccess import *
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
...@@ -74,6 +73,12 @@ options_dict = { ...@@ -74,6 +73,12 @@ options_dict = {
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)], 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True, 'compressible': True,
}, },
'cumulant-K17': {
'method': Method.CUMULANT,
'relaxation_rate': omega,
'compressible': True,
'fourth_order_correction': 0.01
},
'entropic': { 'entropic': {
'method': Method.TRT_KBC_N4, 'method': Method.TRT_KBC_N4,
'compressible': True, 'compressible': True,
...@@ -84,7 +89,12 @@ options_dict = { ...@@ -84,7 +89,12 @@ options_dict = {
}, },
'smagorinsky': { 'smagorinsky': {
'method': Method.SRT, 'method': Method.SRT,
'smagorinsky': False, 'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
'relaxation_rate': omega,
},
'qr': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.QR,
'relaxation_rate': omega, 'relaxation_rate': omega,
} }
} }
...@@ -101,7 +111,8 @@ const bool infoCsePdfs = {cse_pdfs}; ...@@ -101,7 +111,8 @@ const bool infoCsePdfs = {cse_pdfs};
optimize = True optimize = True
with CodeGeneration() as ctx: with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32" pdf_data_type = "float64"
field_data_type = "float64"
config_tokens = ctx.config.split('_') config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3 assert len(config_tokens) >= 3
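As a concrete illustration of the naming scheme assembled by the CMake loop earlier in this diff (stencil_streamingPattern_collisionSetup), one of the newly added configurations splits as follows:

# Example only: one generated configuration name and its tokens.
config_tokens = "d3q27_aa_cumulant-K17".split('_')
# -> ['d3q27', 'aa', 'cumulant-K17']: stencil, streaming pattern, collision setup
assert len(config_tokens) == 3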
...@@ -124,8 +135,8 @@ with CodeGeneration() as ctx: ...@@ -124,8 +135,8 @@ with CodeGeneration() as ctx:
options = options_dict[collision_setup] options = options_dict[collision_setup]
assert stencil.D == 3, "This application supports only three-dimensional stencils" assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx') pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_data_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_data_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field} macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
...@@ -140,13 +151,12 @@ with CodeGeneration() as ctx: ...@@ -140,13 +151,12 @@ with CodeGeneration() as ctx:
else: else:
field_swaps = [] field_swaps = []
# Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM. # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
# is_inplace is set to False to ensure that the streaming is done with src and dst field. # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
# If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense. # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
accessor = CollideOnlyInplaceAccessor() stream_only_kernel = []
accessor.is_inplace = False for i in range(stencil.Q):
field_swaps_stream_only = [(pdfs, pdfs_tmp)] stream_only_kernel.append(Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
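To make the shuffle concrete: for a D3Q19 stencil every PDF is read from a different slot of the same cell, so none of the generated assignments degenerates into a plain copy that the compiler could drop:

# Illustration only (Q = 19): (destination, source) index pairs of the shuffle.
Q = 19
pairs = [(i, (i + 3) % Q) for i in range(Q)]
# pairs[:3]  == [(0, 3), (1, 4), (2, 5)]
# pairs[-2:] == [(17, 1), (18, 2)]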
# LB Sweep # LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
...@@ -158,9 +168,10 @@ with CodeGeneration() as ctx: ...@@ -158,9 +168,10 @@ with CodeGeneration() as ctx:
lb_method = collision_rule.method lb_method = collision_rule.method
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip()) boundary_object=NoSlip(), field_data_type=pdf_data_type)
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([0.05, 0, 0], data_type=field_type)) boundary_object=UBB([0.05, 0, 0], data_type=field_data_type),
field_data_type=pdf_data_type)
generate_lbm_package(ctx, name="UniformGridGPU", generate_lbm_package(ctx, name="UniformGridGPU",
collision_rule=collision_rule, collision_rule=collision_rule,
...@@ -168,12 +179,12 @@ with CodeGeneration() as ctx: ...@@ -168,12 +179,12 @@ with CodeGeneration() as ctx:
nonuniform=False, boundaries=[no_slip, ubb], nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields, macroscopic_fields=macroscopic_fields,
target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params, target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
data_type=field_data_type, pdfs_data_type=pdf_data_type,
max_threads=max_threads) max_threads=max_threads)
# Stream only kernel # Stream only kernel
vp = [('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize2')] generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel,
generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, gpu_indexing_params={'block_size': (128, 1, 1)}, target=ps.Target.GPU,
gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU,
max_threads=max_threads) max_threads=max_threads)
infoHeaderParams = { infoHeaderParams = {
......
...@@ -6,11 +6,25 @@ import sys ...@@ -6,11 +6,25 @@ import sys
import sqlite3 import sqlite3
from math import prod from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 per GPU # Number of time steps run for a workload of 128^3 per GPU
# if double as many cells are on the GPU, half as many time steps are run etc. # if double as many cells are on the GPU, half as many time steps are run etc.
# increase this to get more reliable measurements # increase this to get more reliable measurements
TIME_STEPS_FOR_128_BLOCK = 1000 TIME_STEPS_FOR_128_BLOCK = 1000
DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3") DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
BASE_CONFIG = { BASE_CONFIG = {
'DomainSetup': { 'DomainSetup': {
...@@ -39,6 +53,8 @@ ldc_setup = {'Border': [ ...@@ -39,6 +53,8 @@ ldc_setup = {'Border': [
def num_time_steps(block_size, time_steps_for_128_block=200): def num_time_steps(block_size, time_steps_for_128_block=200):
cells = block_size[0] * block_size[1] * block_size[2] cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block time_steps = (128 ** 3 / cells) * time_steps_for_128_block
if time_steps < 10:
time_steps = 10
return int(time_steps) return int(time_steps)
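A couple of worked values for this helper, passing the 1000-step constant explicitly since the function's own default here is 200:

# (128**3 / 320**3) * 1000 = 64          -> 64 time steps
# (128**3 / 640**3) * 1000 = 8 (< 10)    -> clamped to the minimum of 10
assert num_time_steps((320, 320, 320), 1000) == 64
assert num_time_steps((640, 640, 640), 1000) == 10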
...@@ -61,13 +77,13 @@ class Scenario: ...@@ -61,13 +77,13 @@ class Scenario:
inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
init_shear_flow=False, boundary_setup=False, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1, vtk_write_frequency=0, remaining_time_logger_frequency=-1,
additional_info=None): additional_info=None, blocks=None, db_file_name=None):
if boundary_setup: if boundary_setup:
init_shear_flow = False init_shear_flow = False
periodic = (0, 0, 0) periodic = (0, 0, 0)
self.blocks = block_decomposition(wlb.mpi.numProcesses()) self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block self.cells_per_block = cells_per_block
self.periodic = periodic self.periodic = periodic
...@@ -85,6 +101,7 @@ class Scenario: ...@@ -85,6 +101,7 @@ class Scenario:
self.vtk_write_frequency = vtk_write_frequency self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False) self.config_dict = self.config(print_dict=False)
self.additional_info = additional_info self.additional_info = additional_info
...@@ -97,7 +114,6 @@ class Scenario: ...@@ -97,7 +114,6 @@ class Scenario:
'blocks': self.blocks, 'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block, 'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic, 'periodic': self.periodic,
'oneBlockPerProcess': True
}, },
'Parameters': { 'Parameters': {
'omega': self.omega, 'omega': self.omega,
...@@ -115,7 +131,6 @@ class Scenario: ...@@ -115,7 +131,6 @@ class Scenario:
'Logging': { 'Logging': {
'logLevel': 'info', # info progress detail tracing 'logLevel': 'info', # info progress detail tracing
} }
} }
if self.boundary_setup: if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup config_dict["Boundaries"] = ldc_setup
...@@ -140,6 +155,15 @@ class Scenario: ...@@ -140,6 +155,15 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data) sequenceValuesToScalars(data)
result = data result = data
...@@ -150,8 +174,8 @@ class Scenario: ...@@ -150,8 +174,8 @@ class Scenario:
table_name = table_name.replace("-", "_") # - not allowed for table name would lead to syntax error table_name = table_name.replace("-", "_") # - not allowed for table name would lead to syntax error
for num_try in range(num_tries): for num_try in range(num_tries):
try: try:
checkAndUpdateSchema(result, table_name, DB_FILE) checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, DB_FILE) storeSingle(result, table_name, self.db_file_name)
break break
except sqlite3.OperationalError as e: except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}") wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
...@@ -200,12 +224,70 @@ def overlap_benchmark(): ...@@ -200,12 +224,70 @@ def overlap_benchmark():
scenarios.add(scenario) scenarios.add(scenario)
def no_overlap_scaling(cuda_enabled_mpi=False):
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running scaling benchmark without communication hiding")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
# no overlap
scenarios.add(Scenario(cells_per_block=(256, 256, 256),
cuda_blocks=(128, 1, 1),
time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cuda_enabled_mpi=cuda_enabled_mpi,
outer_iterations=1))
def weak_scaling_overlap(cuda_enabled_mpi=False):
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running scaling benchmark with communication hiding")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
# overlap
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
cuda_blocks=(128, 1, 1),
time_step_strategy=t,
inner_outer_split=(64, 64, 64),
cuda_enabled_mpi=cuda_enabled_mpi,
outer_iterations=1,
boundary_setup=True,
db_file_name="weakScalingUniformGrid.sqlite3"))
def strong_scaling_overlap(cuda_enabled_mpi=False):
wlb.log_info_on_root("Running strong scaling benchmark with one block per proc with communication hiding")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
domain_size = (StrongX, StrongY, StrongZ)
blocks = block_decomposition(wlb.mpi.numProcesses())
cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
# overlap
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
cuda_blocks=(128, 1, 1),
time_step_strategy=t,
inner_outer_split=(1, 1, 1),
cuda_enabled_mpi=cuda_enabled_mpi,
outer_iterations=1,
timesteps=50,
blocks=blocks,
boundary_setup=True,
db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
def single_gpu_benchmark(): def single_gpu_benchmark():
"""Benchmarks only the LBM compute kernel""" """Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single GPU benchmarks") wlb.log_info_on_root("Running single GPU benchmarks")
wlb.log_info_on_root("") wlb.log_info_on_root("")
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8)) gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
gpu_mem = gpu_mem_gb * (2 ** 30) gpu_mem = gpu_mem_gb * (2 ** 30)
gpu_type = os.environ.get('GPU_TYPE') gpu_type = os.environ.get('GPU_TYPE')
...@@ -214,12 +296,8 @@ def single_gpu_benchmark(): ...@@ -214,12 +296,8 @@ def single_gpu_benchmark():
additional_info['gpu_type'] = gpu_type additional_info['gpu_type'] = gpu_type
scenarios = wlb.ScenarioManager() scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (32, 64, 128, 256)] block_sizes = [(i, i, i) for i in (128, 256, 320)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1), cuda_blocks = [(128, 1, 1), ]
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
(32, 8, 1), (64, 8, 1),
(32, 16, 1)]
for block_size in block_sizes: for block_size in block_sizes:
for cuda_block_size in cuda_blocks: for cuda_block_size in cuda_blocks:
# cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1) # cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
...@@ -266,4 +344,14 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI ...@@ -266,4 +344,14 @@ wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FI
# performance of compute kernel (no communication) # performance of compute kernel (no communication)
# overlap_benchmark() # benchmarks different communication overlap options # overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only # profiling() # run only two timesteps on a smaller domain for profiling only
validation_run() # validation_run()
if BENCHMARK == 0:
single_gpu_benchmark()
elif BENCHMARK == 1:
weak_scaling_overlap(True)
elif BENCHMARK == 2:
strong_scaling_overlap(True)
else:
validation_run()
...@@ -17,7 +17,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) ...@@ -17,7 +17,8 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
set ( pythonModules "-Wl,-whole-archive" ${PYTHON_MODULE_DEPENDENCIES} "-Wl,-no-whole-archive" ) set ( pythonModules "-Wl,-whole-archive" ${PYTHON_MODULE_DEPENDENCIES} "-Wl,-no-whole-archive" )
endif() endif()
add_library( walberla_cpp SHARED PythonModule.cpp ) add_library( walberla_cpp SHARED PythonModule.cpp
../showcases/FlowAroundSphere/FlowAroundSphere.cpp)
target_link_libraries( walberla_cpp ${WALBERLA_LINK_LIBRARIES_KEYWORD} ${pythonModules} ${SERVICE_LIBS} ) target_link_libraries( walberla_cpp ${WALBERLA_LINK_LIBRARIES_KEYWORD} ${pythonModules} ${SERVICE_LIBS} )
......
...@@ -10,6 +10,11 @@ add_subdirectory( PegIntoSphereBed ) ...@@ -10,6 +10,11 @@ add_subdirectory( PegIntoSphereBed )
if ( WALBERLA_BUILD_WITH_CODEGEN) if ( WALBERLA_BUILD_WITH_CODEGEN)
add_subdirectory( Antidunes ) add_subdirectory( Antidunes )
add_subdirectory( FlowAroundSphere )
add_subdirectory( FlowAroundSphereCPU )
add_subdirectory( FlowAroundCylinder )
add_subdirectory( Channel )
add_subdirectory( TaylorGreenVortex )
if (WALBERLA_BUILD_WITH_PYTHON) if (WALBERLA_BUILD_WITH_PYTHON)
add_subdirectory( PhaseFieldAllenCahn ) add_subdirectory( PhaseFieldAllenCahn )
......
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_generate_target_from_python(NAME ChannelGenerated
FILE Channel.py
OUT_FILES ChannelStorageSpecification.h ChannelStorageSpecification.cpp
ChannelSweepCollection.h ChannelSweepCollection.cpp
NoSlip.h NoSlip.cpp
UBB.h UBB.cpp
ChannelBoundaryCollection.h
ChannelHeader.h)
waLBerla_add_executable ( NAME Channel
FILES Channel.cpp
DEPENDS ChannelGenerated blockforest core field geometry lbm_generated timeloop )
This diff is collapsed.
Parameters
{
omega 1.4;
timesteps 10;
maxLatticeVelocity 0.01;
remainingTimeLoggerFrequency 0; // in seconds
vtkWriteFrequency 0;
}
DomainSetup
{
blocks < 2, 2, 1 >;
cellsPerBlock < 4, 4, 4 >;
periodic < 1, 0, 1 >;
refinementLevels 1;
numberProcesses 1; // used for load balancing; overwritten if more than one process is used
}
Boundaries
{
Border { direction S; walldistance -1; flag NoSlip; }
Border { direction N; walldistance -1; flag UBB; }
}
VTKWriter
{
vtkWriteFrequency 1;
velocity true;
density true;
averageFields true;
flag false;
writeOnlySlice false;
amrFileFormat false;
oneFilePerProcess false;
}
Logging
{
logLevel info; // info progress detail tracing
writeSetupForestAndReturn false;
remainingTimeLoggerFrequency 60; // in seconds
}
Evaluation
{
evaluationCheckFrequency 100;
logToStream true;
logToFile true;
filename Channel.txt;
}
import sympy as sp
from pystencils import Target
from pystencils import fields
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
import warnings
warnings.filterwarnings("ignore")
with CodeGeneration() as ctx:
target = Target.CPU # Target.GPU if ctx.cuda else Target.CPU
data_type = "float64" if ctx.double_accuracy else "float32"
pdf_dtype = "float64"
streaming_pattern = 'pull'
timesteps = get_timesteps(streaming_pattern)
omega = sp.symbols("omega")
stencil = LBStencil(Stencil.D3Q27)
dim = stencil.D
pdfs, pdfs_tmp = fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_dtype}[3D]", layout='fzyx')
velocity_field, density_field = fields(f"velocity({dim}), density(1) : {data_type}[{dim}D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.TRT, relaxation_rate=omega,
streaming_pattern=streaming_pattern, compressible=True)
lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
method = create_lb_method(lbm_config=lbm_config)
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([sp.Symbol("u_x"), 0, 0], data_type=data_type))
generate_lbm_package(ctx, name="Channel",
collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=True, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields, data_type=data_type)
generate_info_header(ctx, 'ChannelHeader')