diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs_RDM.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs_RDM.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce2a499308a2d27a3ca83f7c6ccc6bba599f2be2
--- /dev/null
+++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs_RDM.py
@@ -0,0 +1,244 @@
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sys
+import sqlite3
+from pprint import pformat
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
+# Number of time steps run for a workload of 128^3 cells per process.
+# If twice as many cells are on a process, half as many time steps are run, etc.
+# Increase this value to get more reliable measurements.
+TIME_STEPS_FOR_128_BLOCK = int(os.environ.get('TIME_STEPS_FOR_128_BLOCK', 100))
+DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
+
+
+def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
+    """
+    Calculate the number of time steps based on the block size.
+
+    This function computes the number of time steps for a given block size by scaling
+    the reference number of time steps for a 128x128x128 block to the given number of
+    cells per block, so that the total work per process stays roughly constant.
+
+    Parameters:
+        block_size (tuple): A tuple of three integers representing the dimensions of the cells_per_block (x, y, z).
+        time_steps_for_128_block (int, optional): The number of time steps for a 128x128x128 block. Defaults to TIME_STEPS_FOR_128_BLOCK (100 unless overridden via the environment).
+
+    Returns:
+        int: The calculated number of time steps, with a minimum value of 5.
+ """ + cells = block_size[0] * block_size[1] * block_size[2] + time_steps = (128 ** 3 / cells) * time_steps_for_128_block + if time_steps < 5: + time_steps = 5 + return int(time_steps) + + +ldc_setup = {'Border': [ + {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, +]} + + +class Scenario: + def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1, + timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1), + warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False, + vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None): + + if boundary_setup: + init_shear_flow = False + periodic = (0, 0, 0) + + self.blocks_per_process = blocks_per_process + self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses()) + + self.cells_per_block = cells_per_block + self.periodic = periodic + + self.time_step_strategy = time_step_strategy + self.omega = omega + self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block) + self.inner_outer_split = inner_outer_split + self.init_shear_flow = init_shear_flow + self.boundary_setup = boundary_setup + self.warmup_steps = warmup_steps + self.outer_iterations = outer_iterations + + self.vtk_write_frequency = vtk_write_frequency + self.remaining_time_logger_frequency = remaining_time_logger_frequency + self.db_file_name = DB_FILE if db_file_name is None else db_file_name + + self.config_dict = self.config(print_dict=False) + + @wlb.member_callback + def config(self, print_dict=True): + config_dict = { + 'DomainSetup': { + 'blocks': self.blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic, + 'cartesianSetup': (self.blocks_per_process == 1) + }, + 'Parameters': { + 'omega': self.omega, + 'warmupSteps': self.warmup_steps, + 'outerIterations': self.outer_iterations, + 'timeStepStrategy': self.time_step_strategy, + 'timesteps': self.timesteps, + 'initShearFlow': self.init_shear_flow, + 'innerOuterSplit': self.inner_outer_split, + 'vtkWriteFrequency': self.vtk_write_frequency, + 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency + } + } + if self.boundary_setup: + config_dict["Boundaries"] = ldc_setup + + if print_dict: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + + @wlb.member_callback + def results_callback(self, **kwargs): + data = {} + data.update(self.config_dict['Parameters']) + data.update(self.config_dict['DomainSetup']) + data.update(kwargs) + + data['executable'] = sys.argv[0] + data['compile_flags'] = wlb.build_info.compiler_flags + data['walberla_version'] = wlb.build_info.version + data['build_machine'] = wlb.build_info.build_machine + + if ms: + state = ms.MachineState(extended=False, anonymous=True) + state.generate() # generate subclasses + state.update() # read information + data["MachineState"] = str(state.get()) + else: + print("MachineState module is not available. MachineState was not saved") + + sequenceValuesToScalars(data) + + result = data + sequenceValuesToScalars(result) + num_tries = 4 + # check multiple times e.g. 
+        table_name = "runs"
+        table_name = table_name.replace("-", "_")
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
+
+
+# -------------------------------------- Functions trying different parameter sets -----------------------------------
+
+
+def weak_scaling_benchmark():
+    wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    for t in ["simpleOverlap"]:
+        scenarios.add(Scenario(time_step_strategy=t,
+                               inner_outer_split=(1, 1, 1),
+                               cells_per_block=(WeakX, WeakY, WeakZ),
+                               boundary_setup=True,
+                               outer_iterations=1,
+                               db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
+
+
+def strong_scaling_benchmark():
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+
+    for t in ["simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               time_step_strategy=t,
+                               outer_iterations=1,
+                               timesteps=num_time_steps(cells_per_block),
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
+
+
+def single_node_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single Node benchmarks")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(cells_per_block=(128, 128, 128),
+                        time_step_strategy='kernelOnly',
+                        outer_iterations=1,
+                        timesteps=10)
+    scenarios.add(scenario)
+
+
+def validation_run():
+    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
+    wlb.log_info_on_root("Validation run")
+    wlb.log_info_on_root("")
+
+    time_step_strategy = "noOverlap"
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(cells_per_block=(64, 64, 64),
+                        time_step_strategy=time_step_strategy,
+                        timesteps=201,
+                        outer_iterations=1,
+                        warmup_steps=0,
+                        init_shear_flow=False,
+                        boundary_setup=True,
+                        vtk_write_frequency=50,
+                        remaining_time_logger_frequency=10)
+    scenarios.add(scenario)
+
+
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+# Select the benchmark to run via the BENCHMARK environment variable
+# (see the if/elif dispatch directly below):
+#   BENCHMARK == 0: single_node_benchmark()     # measures only the LBM compute kernel
+#                                                 ('kernelOnly' strategy, no communication)
+#   BENCHMARK == 1: weak_scaling_benchmark()    # one block per process, constant cells per process
+#   BENCHMARK == 2: strong_scaling_benchmark() # one block per process, fixed total domain size
+#   otherwise:      validation_run()            # lid-driven cavity run with VTK output
+
+if BENCHMARK == 0:
+    single_node_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_benchmark()
+elif BENCHMARK == 2:
+    strong_scaling_benchmark()
else:
+    validation_run()
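
A quick illustration (not part of the committed diff) of the workload scaling encoded in num_time_steps(): the standalone Python sketch below reproduces the same formula outside waLBerla, with scaled_time_steps used as a hypothetical helper name for illustration only.

# Standalone sketch of the scaling used by num_time_steps() above: the number of
# time steps shrinks proportionally as the cells per block grow, so the work per
# process stays roughly constant, with a floor of 5 steps.
def scaled_time_steps(block_size, time_steps_for_128_block=100):
    cells = block_size[0] * block_size[1] * block_size[2]
    return max(5, int((128 ** 3 / cells) * time_steps_for_128_block))


if __name__ == "__main__":
    for size in [(64, 64, 64), (128, 128, 128), (256, 256, 256)]:
        # prints 800, 100 and 12 time steps respectively
        print(size, scaled_time_steps(size))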
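
Similarly, a minimal standalone sketch (again not part of the committed diff) of the retry pattern used in results_callback(): concurrent benchmark processes writing to the same SQLite file can raise sqlite3.OperationalError (e.g. "database is locked"), so the write is attempted several times. The helper name store_with_retry and the optional back-off are illustrative assumptions; the committed code retries the waLBerla checkAndUpdateSchema/storeSingle calls without waiting.

import sqlite3
import time


def store_with_retry(db_file, sql, params=(), num_tries=4, wait_seconds=0.5):
    # Retry the write a few times, since concurrent writers may hold the SQLite lock.
    for num_try in range(num_tries):
        try:
            with sqlite3.connect(db_file) as conn:  # commits on clean exit
                conn.execute(sql, params)
            return True
        except sqlite3.OperationalError as e:
            print(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {e}")
            time.sleep(wait_seconds)
    return False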