diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs_RDM.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs_RDM.py
new file mode 100755
index 0000000000000000000000000000000000000000..5d8614573c06a9993cc971dfcb489008c6b58956
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs_RDM.py
@@ -0,0 +1,319 @@
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sys
+import sqlite3
+from math import prod
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
+# Number of time steps run for a workload of 128^3 cells per GPU;
+# if twice as many cells are on the GPU, half as many time steps are run, etc.
+# Increase this value to get more reliable measurements.
+TIME_STEPS_FOR_128_BLOCK = 1000
+DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
+
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+    }
+}
+
+ldc_setup = {'Border': [
+    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+]}
+
+
+def num_time_steps(block_size, time_steps_for_128_block=1000):
+    """
+    Calculate the number of time steps based on the block size.
+
+    This function computes the number of time steps for a given block size by scaling
+    the number of time steps that one process can execute within one second on a
+    128x128x128 cells_per_block to the given cells_per_block size.
+
+    Parameters:
+    block_size (tuple): A tuple of three integers representing the dimensions of the cells_per_block (x, y, z).
+    time_steps_for_128_block (int, optional): The number of time steps for a 128x128x128 block. Default is 1000,
+    which is approximately the number of time steps executed within one second on one entire MareNostrum5-acc node.
+
+    Returns:
+    int: The calculated number of time steps, with a minimum value of 10.
+    """
+    cells = block_size[0] * block_size[1] * block_size[2]
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    if time_steps < 10:
+        time_steps = 10
+    return int(time_steps)
+
+
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """
+    Checks if a given CUDA block size does not exceed the SM register limit.
+    The default of 168 registers per thread was obtained with cuobjdump on both the SRT
+    and the cumulant kernels. You might want to validate this value for your own kernels.
+    """
+    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """
+    Checks if a single block of the given size fits into GPU memory.
+    """
+    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1),
+                 timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
+                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
+                 init_shear_flow=False, boundary_setup=False,
+                 vtk_write_frequency=0, remaining_time_logger_frequency=-1,
+                 additional_info=None, blocks=None, db_file_name=None):
+
+        if boundary_setup:
+            init_shear_flow = False
+            periodic = (0, 0, 0)
+
+        self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())
+
+        self.cells_per_block = cells_per_block
+        self.periodic = periodic
+
+        self.time_step_strategy = time_step_strategy
+        self.omega = omega
+        self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
+        self.cuda_enabled_mpi = cuda_enabled_mpi
+        self.inner_outer_split = inner_outer_split
+        self.init_shear_flow = init_shear_flow
+        self.boundary_setup = boundary_setup
+        self.warmup_steps = warmup_steps
+        self.outer_iterations = outer_iterations
+        self.cuda_blocks = cuda_blocks
+
+        self.vtk_write_frequency = vtk_write_frequency
+        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
+
+        self.config_dict = self.config(print_dict=False)
+        self.additional_info = additional_info
+
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'omega': self.omega,
+                'cudaEnabledMPI': self.cuda_enabled_mpi,
+                'warmupSteps': self.warmup_steps,
+                'outerIterations': self.outer_iterations,
+                'timeStepStrategy': self.time_step_strategy,
+                'timesteps': self.timesteps,
+                'initShearFlow': self.init_shear_flow,
+                'gpuBlockSize': self.cuda_blocks,
+                'innerOuterSplit': self.inner_outer_split,
+                'vtkWriteFrequency': self.vtk_write_frequency,
+                'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
+            },
+            'Logging': {
+                'logLevel': 'info',  # possible values: info, progress, detail, tracing
+            }
+        }
+        if self.boundary_setup:
+            config_dict["Boundaries"] = ldc_setup
+
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+            if self.additional_info:
+                wlb.log_info_on_root("Additional Info:\n" + pformat(self.additional_info))
+        return config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+
+        if self.additional_info is not None:
+            data.update(self.additional_info)
+
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()  # generate subclasses
+            state.update()  # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
+        sequenceValuesToScalars(data)
+
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # Retry the write a few times: it may fail when multiple benchmark processes access the database concurrently.
+        table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
+        table_name = table_name.replace("-", "_")  # "-" is not allowed in a table name; it would cause an SQL syntax error
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
+
+
+# -------------------------------------- Functions trying different parameter sets -----------------------------------
+
+
+def weak_scaling_overlap(cuda_enabled_mpi=False):
+    """Weak scaling benchmark with communication hiding"""
+    wlb.log_info_on_root("Running weak scaling benchmark with communication hiding")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    # overlap
+    for t in ["simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(8, 8, 8),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               boundary_setup=True,
+                               db_file_name="weakScalingUniformGrid.sqlite3"))
+
+
+def strong_scaling_overlap(cuda_enabled_mpi=False):
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per process and communication hiding")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+
+    # overlap
+    for t in ["simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(1, 1, 1),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               timesteps=num_time_steps(cells_per_block),
+                               blocks=blocks,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
+
+
+def single_gpu_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("")
+
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+
+    additional_info = {}
+    if gpu_type is not None:
+        additional_info['gpu_type'] = gpu_type
+
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (128, 256, 320)]
+    cuda_blocks = [(128, 1, 1), ]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            # e.g. cuda_block_size = (256, 1, 1) with block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
+            if cuda_block_size > block_size:
+                continue
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
+            scenario = Scenario(cells_per_block=block_size,
+                                cuda_blocks=cuda_block_size,
+                                time_step_strategy='kernelOnly',
+                                timesteps=num_time_steps(block_size, 2000),
+                                outer_iterations=1,
+                                additional_info=additional_info)
+            scenarios.add(scenario)
+
+
+def validation_run():
+    """Run either a fully periodic shear flow or a boundary scenario (lid-driven cavity) to check that the code works"""
+    wlb.log_info_on_root("Validation run")
+    wlb.log_info_on_root("")
+
+    time_step_strategy = "noOverlap"  # "simpleOverlap"
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(cells_per_block=(128, 128, 128),
+                        time_step_strategy=time_step_strategy,
+                        timesteps=10001,
+                        outer_iterations=1,
+                        warmup_steps=0,
+                        init_shear_flow=False,
+                        boundary_setup=True,
+                        vtk_write_frequency=5000,
+                        remaining_time_logger_frequency=30)
+    scenarios.add(scenario)
+
+
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+# Select the benchmark to run via the BENCHMARK environment variable:
+#   BENCHMARK=0: single_gpu_benchmark()    benchmarks different domain sizes and measures the single-GPU
+#                                          performance of the compute kernel (no communication)
+#   BENCHMARK=1: weak_scaling_overlap()    weak scaling benchmark with communication hiding
+#   BENCHMARK=2: strong_scaling_overlap()  strong scaling benchmark with communication hiding
+#   otherwise:   validation_run()
+
+if BENCHMARK == 0:
+    single_gpu_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_overlap(True)
+elif BENCHMARK == 2:
+    strong_scaling_overlap(True)
+else:
+    validation_run()
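
Reviewer's note on the sizing logic: the sketch below re-implements `num_time_steps`, `cuda_block_size_ok` and `domain_block_size_ok` outside of waLBerla to show which cubic block sizes `single_gpu_benchmark()` would keep and how many time steps each would run. This is an illustration only; the 40 GiB memory budget matches the script's `GPU_MEMORY_GB` default, while the candidate size 640 (not used in the script) is an assumption added to show a rejected case.

```python
from math import prod


# Stand-alone copies of the helpers from benchmark_configs_RDM.py (illustration only).
def num_time_steps(block_size, time_steps_for_128_block=1000):
    cells = prod(block_size)
    return max(int((128 ** 3 / cells) * time_steps_for_128_block), 10)


def cuda_block_size_ok(block_size, regs_per_threads=168):
    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)  # 64k registers per SM


def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
    # one ghost layer per side, 27 PDFs per cell, 8 bytes per value (double precision)
    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem


gpu_mem = 40 * (2 ** 30)  # default GPU_MEMORY_GB of the script, in bytes

for n in (128, 256, 320, 640):  # candidate cubic cells_per_block sizes (assumed)
    block = (n, n, n)
    if not domain_block_size_ok(block, gpu_mem):
        print(f"{block}: skipped, PDF fields would not fit into {gpu_mem // 2 ** 30} GiB")
        continue
    # 'kernelOnly' runs use num_time_steps(block_size, 2000), as in single_gpu_benchmark()
    print(f"{block}: {num_time_steps(block, 2000)} time steps")

print(cuda_block_size_ok((128, 1, 1)))  # True:  128 * 168 = 21504 registers per block
print(cuda_block_size_ok((512, 1, 1)))  # False: 512 * 168 = 86016 exceeds the 64k limit
```

In an actual run, this file is meant to be passed to the compiled UniformGridGPU app, which evaluates it through waLBerla's Python callbacks (`wlb.ScenarioManager`, `config`, `results_callback`); the `BENCHMARK`, `DB_FILE`, `GPU_MEMORY_GB` and `Weak*`/`Strong*` environment variables then select the scenario set, the output database, and the domain sizes.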