New Time Loop - makes small block scenarios faster

- The new time loop caches every kernel function together with its argument dict, so the inner loop only has to call the C functions with the cached kwargs.

New Time Loop - makes small block scenarios faster
db4edb4b · Martin Bauer · 8a88243e · db4edb4b · db4edb4b
Commit db4edb4b authored 7 years ago by Martin Bauer
--- a/lbstep.py
+++ b/lbstep.py
@@ -119,7 +119,10 @@ class LatticeBoltzmannStep:
        self.ast = self._lbmKernels[0].ast
        # -- Boundary Handling  & Synchronization ---
-        self._sync = data_handling.synchronization_function([self._pdf_arr_name], method_parameters['stencil'], target)
+        stencil_name = method_parameters['stencil']
+        self._sync_src = data_handling.synchronization_function([self._pdf_arr_name], stencil_name, target)
+        self._sync_tmp = data_handling.synchronization_function([self._tmp_arr_name], stencil_name, target)
        self._boundary_handling = LatticeBoltzmannBoundaryHandling(self.method, self._data_handling, self._pdf_arr_name,
                                                                   name=name + "_boundary_handling",
                                                                   flag_interface=flag_interface,
@@ -223,16 +226,42 @@ class LatticeBoltzmannStep:
    def time_step(self):
        if len(self._lbmKernels) == 2:  # collide stream
            self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params)
-            self._sync()
+            self._sync_src()
            self._boundary_handling(**self.kernel_params)
            self._data_handling.run_kernel(self._lbmKernels[1], **self.kernel_params)
        else:  # stream collide
-            self._sync()
+            self._sync_src()
            self._boundary_handling(**self.kernel_params)
            self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params)
        self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu)
-        self.time_steps_run += 1
+    def get_time_loop(self):
+        self.pre_run()  # make sure GPU arrays are allocated
+        fixed_loop = TimeLoop(steps=2)
+        fixed_loop.add_pre_run_function(self.pre_run)
+        fixed_loop.add_post_run_function(self.post_run)
+        fixed_loop.add_single_step_function(self.time_step)
+        for t in range(2):
+            if len(self._lbmKernels) == 2:  # collide stream
+                collide_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[0], **self.kernel_params)
+                fixed_loop.add_call(self._lbmKernels[0], collide_args)
+                fixed_loop.add_call(self._sync_src if t == 0 else self._sync_tmp, {})
+                self._boundary_handling.add_fixed_steps(fixed_loop, **self.kernel_params)
+                stream_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[1], **self.kernel_params)
+                fixed_loop.add_call(self._lbmKernels[1], stream_args)
+            else:  # stream collide
+                fixed_loop.add_call(self._sync_src if t == 0 else self._sync_tmp, {})
+                self._boundary_handling.add_fixed_steps(fixed_loop, **self.kernel_params)
+                stream_collide_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[0], **self.kernel_params)
+                fixed_loop.add_call(self._lbmKernels[0], stream_collide_args)
+            self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu)
+        return fixed_loop
    def post_run(self):
        if self._gpu:
@@ -240,24 +269,31 @@ class LatticeBoltzmannStep:
        self._data_handling.run_kernel(self._getterKernel, **self.kernel_params)
    def run(self, time_steps):
+        time_loop = self.get_time_loop()
+        time_loop.run(time_steps)
+        self.time_steps_run += time_loop.time_steps_run
+    def run_old(self, time_steps):
        self.pre_run()
        for i in range(time_steps):
            self.time_step()
        self.post_run()
+        self.time_steps_run += time_steps
    def benchmark_run(self, time_steps):
-        time_loop = TimeLoop()
+        time_loop = self.get_time_loop()
-        time_loop.add_step(self)
        duration_of_time_step = time_loop.benchmark_run(time_steps)
        mlups = self.number_of_cells / duration_of_time_step * 1e-6
+        self.time_steps_run += time_loop.time_steps_run
        return mlups
    def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'):
-        time_loop = TimeLoop()
+        time_loop = self.get_time_loop()
-        time_loop.add_step(self)
        duration_of_time_step = time_loop.benchmark(time_for_benchmark, init_time_steps,
                                                    number_of_time_steps_for_estimation)
        mlups = self.number_of_cells / duration_of_time_step * 1e-6
+        self.time_steps_run += time_loop.time_steps_run
        return mlups
    def write_vtk(self):
@@ -328,7 +364,7 @@ class LatticeBoltzmannStep:
            self._data_handling.all_to_gpu()
            for i in range(check_residuum_after):
                steps_run += 1
-                self._sync()
+                self._sync_src()
                self._boundary_handling(**self.kernel_params)
                self._data_handling.run_kernel(self._velocity_init_kernel, **self.kernel_params)
                self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, gpu=gpu)

--- a/macroscopic_value_kernels.py
+++ b/macroscopic_value_kernels.py
+import functools
 from copy import deepcopy
 from pystencils.field import Field, get_layout_of_array
 from lbmpy.simplificationfactory import create_simplification_strategy
@@ -7,12 +8,15 @@ def compile_macroscopic_values_getter(lb_method, output_quantities, pdf_arr=None
    """
    Create kernel to compute macroscopic value(s) from a pdf field (e.g. density or velocity)
-    :param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
+    Args:
-    :param output_quantities: sequence of quantities to compute e.g. ['density', 'velocity']
+        lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
-    :param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
+        output_quantities: sequence of quantities to compute e.g. ['density', 'velocity']
-    :param field_layout: layout for output field, also used for pdf field if pdf_arr is not given
+        pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
-    :param target: 'cpu' or 'gpu'
+        field_layout: layout for output field, also used for pdf field if pdf_arr is not given
-    :return: a function to compute macroscopic values:
+        target: 'cpu' or 'gpu'
+    Returns:
+        a function to compute macroscopic values:
        - pdf_array
        - keyword arguments from name of conserved quantity (as in output_quantities) to numpy field
    """
@@ -83,12 +87,15 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None
    Creates a function that sets a pdf field to specified macroscopic quantities
    The returned function can be called with the pdf field to set as single argument
-    :param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
+    Args:
-    :param quantities_to_set: map from conserved quantity name to fixed value or numpy array
+        lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
-    :param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
+        quantities_to_set: map from conserved quantity name to fixed value or numpy array
-    :param field_layout: layout of the pdf field if pdf_arr was not given
+        pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
-    :param target: 'cpu' or 'gpu'
+        field_layout: layout of the pdf field if pdf_arr was not given
-    :return: function taking pdf array as single argument and which sets the field to the given values
+        target: 'cpu' or 'gpu'
+    Returns:
+        function taking pdf array as single argument and which sets the field to the given values
    """
    if pdf_arr is not None:
        pdf_field = Field.create_from_numpy_array('pdfs', pdf_arr, index_dimensions=1)
@@ -128,10 +135,12 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None
    if target == 'cpu':
        import pystencils.cpu as cpu
-        kernel = cpu.make_python_function(cpu.create_kernel(eq), argument_dict=fixed_kernel_parameters)
+        kernel = cpu.make_python_function(cpu.create_kernel(eq))
+        kernel = functools.partial(kernel, **fixed_kernel_parameters)
    elif target == 'gpu':
        import pystencils.gpucuda as gpu
-        kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq), argument_dict=fixed_kernel_parameters)
+        kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq))
+        kernel = functools.partial(kernel, **fixed_kernel_parameters)
    else:
        raise ValueError("Unknown target '%s'. Possible targets are 'cpu' and 'gpu'" % (target,))
@@ -153,7 +162,7 @@ def create_advanced_velocity_setter_collision_rule(method, velocity_field: Field
        method: lattice boltzmann method object
        velocity_field: pystencils field
        velocity_relaxation_rate: relaxation rate for the velocity moments - determines convergence behaviour
-                                of the initialization scheme
+                                  of the initialization scheme
    Returns:
        LB collision rule