From db4edb4bb58c686f7b6bd58767bff449b956e7de Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Thu, 14 Jun 2018 16:07:49 +0200 Subject: [PATCH] New Time Loop - makes small block scenarios faster - new time loop caches all kernel functions with their argument dict -> inner loop just calls C functions with cached kwargs --- lbstep.py | 54 ++++++++++++++++++++++++++++++------ macroscopic_value_kernels.py | 39 ++++++++++++++++---------- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/lbstep.py b/lbstep.py index a60e9418..a71bec99 100644 --- a/lbstep.py +++ b/lbstep.py @@ -119,7 +119,10 @@ class LatticeBoltzmannStep: self.ast = self._lbmKernels[0].ast # -- Boundary Handling & Synchronization --- - self._sync = data_handling.synchronization_function([self._pdf_arr_name], method_parameters['stencil'], target) + stencil_name = method_parameters['stencil'] + self._sync_src = data_handling.synchronization_function([self._pdf_arr_name], stencil_name, target) + self._sync_tmp = data_handling.synchronization_function([self._tmp_arr_name], stencil_name, target) + self._boundary_handling = LatticeBoltzmannBoundaryHandling(self.method, self._data_handling, self._pdf_arr_name, name=name + "_boundary_handling", flag_interface=flag_interface, @@ -223,16 +226,42 @@ class LatticeBoltzmannStep: def time_step(self): if len(self._lbmKernels) == 2: # collide stream self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params) - self._sync() + self._sync_src() self._boundary_handling(**self.kernel_params) self._data_handling.run_kernel(self._lbmKernels[1], **self.kernel_params) else: # stream collide - self._sync() + self._sync_src() self._boundary_handling(**self.kernel_params) self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params) self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu) - self.time_steps_run += 1 + + def get_time_loop(self): + self.pre_run() # make sure GPU arrays are allocated + + fixed_loop = TimeLoop(steps=2) + fixed_loop.add_pre_run_function(self.pre_run) + fixed_loop.add_post_run_function(self.post_run) + fixed_loop.add_single_step_function(self.time_step) + + for t in range(2): + if len(self._lbmKernels) == 2: # collide stream + collide_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[0], **self.kernel_params) + fixed_loop.add_call(self._lbmKernels[0], collide_args) + + fixed_loop.add_call(self._sync_src if t == 0 else self._sync_tmp, {}) + self._boundary_handling.add_fixed_steps(fixed_loop, **self.kernel_params) + + stream_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[1], **self.kernel_params) + fixed_loop.add_call(self._lbmKernels[1], stream_args) + else: # stream collide + fixed_loop.add_call(self._sync_src if t == 0 else self._sync_tmp, {}) + self._boundary_handling.add_fixed_steps(fixed_loop, **self.kernel_params) + stream_collide_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[0], **self.kernel_params) + fixed_loop.add_call(self._lbmKernels[0], stream_collide_args) + + self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu) + return fixed_loop def post_run(self): if self._gpu: @@ -240,24 +269,31 @@ class LatticeBoltzmannStep: self._data_handling.run_kernel(self._getterKernel, **self.kernel_params) def run(self, time_steps): + time_loop = self.get_time_loop() + time_loop.run(time_steps) + self.time_steps_run += time_loop.time_steps_run + + def run_old(self, time_steps): self.pre_run() for i in range(time_steps): self.time_step() self.post_run() + self.time_steps_run += time_steps + def benchmark_run(self, time_steps): - time_loop = TimeLoop() - time_loop.add_step(self) + time_loop = self.get_time_loop() duration_of_time_step = time_loop.benchmark_run(time_steps) mlups = self.number_of_cells / duration_of_time_step * 1e-6 + self.time_steps_run += time_loop.time_steps_run return mlups def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'): - time_loop = TimeLoop() - time_loop.add_step(self) + time_loop = self.get_time_loop() duration_of_time_step = time_loop.benchmark(time_for_benchmark, init_time_steps, number_of_time_steps_for_estimation) mlups = self.number_of_cells / duration_of_time_step * 1e-6 + self.time_steps_run += time_loop.time_steps_run return mlups def write_vtk(self): @@ -328,7 +364,7 @@ class LatticeBoltzmannStep: self._data_handling.all_to_gpu() for i in range(check_residuum_after): steps_run += 1 - self._sync() + self._sync_src() self._boundary_handling(**self.kernel_params) self._data_handling.run_kernel(self._velocity_init_kernel, **self.kernel_params) self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, gpu=gpu) diff --git a/macroscopic_value_kernels.py b/macroscopic_value_kernels.py index d908ac8b..2e712391 100644 --- a/macroscopic_value_kernels.py +++ b/macroscopic_value_kernels.py @@ -1,3 +1,4 @@ +import functools from copy import deepcopy from pystencils.field import Field, get_layout_of_array from lbmpy.simplificationfactory import create_simplification_strategy @@ -7,12 +8,15 @@ def compile_macroscopic_values_getter(lb_method, output_quantities, pdf_arr=None """ Create kernel to compute macroscopic value(s) from a pdf field (e.g. density or velocity) - :param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod` - :param output_quantities: sequence of quantities to compute e.g. ['density', 'velocity'] - :param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel - :param field_layout: layout for output field, also used for pdf field if pdf_arr is not given - :param target: 'cpu' or 'gpu' - :return: a function to compute macroscopic values: + Args: + lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod` + output_quantities: sequence of quantities to compute e.g. ['density', 'velocity'] + pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel + field_layout: layout for output field, also used for pdf field if pdf_arr is not given + target: 'cpu' or 'gpu' + + Returns: + a function to compute macroscopic values: - pdf_array - keyword arguments from name of conserved quantity (as in output_quantities) to numpy field """ @@ -83,12 +87,15 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None Creates a function that sets a pdf field to specified macroscopic quantities The returned function can be called with the pdf field to set as single argument - :param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod` - :param quantities_to_set: map from conserved quantity name to fixed value or numpy array - :param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel - :param field_layout: layout of the pdf field if pdf_arr was not given - :param target: 'cpu' or 'gpu' - :return: function taking pdf array as single argument and which sets the field to the given values + Args: + lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod` + quantities_to_set: map from conserved quantity name to fixed value or numpy array + pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel + field_layout: layout of the pdf field if pdf_arr was not given + target: 'cpu' or 'gpu' + + Returns: + function taking pdf array as single argument and which sets the field to the given values """ if pdf_arr is not None: pdf_field = Field.create_from_numpy_array('pdfs', pdf_arr, index_dimensions=1) @@ -128,10 +135,12 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None if target == 'cpu': import pystencils.cpu as cpu - kernel = cpu.make_python_function(cpu.create_kernel(eq), argument_dict=fixed_kernel_parameters) + kernel = cpu.make_python_function(cpu.create_kernel(eq)) + kernel = functools.partial(kernel, **fixed_kernel_parameters) elif target == 'gpu': import pystencils.gpucuda as gpu - kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq), argument_dict=fixed_kernel_parameters) + kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq)) + kernel = functools.partial(kernel, **fixed_kernel_parameters) else: raise ValueError("Unknown target '%s'. Possible targets are 'cpu' and 'gpu'" % (target,)) @@ -153,7 +162,7 @@ def create_advanced_velocity_setter_collision_rule(method, velocity_field: Field method: lattice boltzmann method object velocity_field: pystencils field velocity_relaxation_rate: relaxation rate for the velocity moments - determines convergence behaviour - of the initialization scheme + of the initialization scheme Returns: LB collision rule -- GitLab