From db4edb4bb58c686f7b6bd58767bff449b956e7de Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Thu, 14 Jun 2018 16:07:49 +0200
Subject: [PATCH] New Time Loop - makes small block scenarios faster

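- new time loop caches all kernel functions together with their argument dicts
  -> inner loop just calls the C functions with the cached kwargs
- time_step() no longer increments time_steps_run; run(), benchmark_run() and
  benchmark() add the step count reported by the time loop instead
- the previous Python-level loop is kept as run_old()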
---
 lbstep.py                    | 54 ++++++++++++++++++++++++++++++------
 macroscopic_value_kernels.py | 39 ++++++++++++++++----------
 2 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/lbstep.py b/lbstep.py
index a60e9418..a71bec99 100644
--- a/lbstep.py
+++ b/lbstep.py
@@ -119,7 +119,10 @@ class LatticeBoltzmannStep:
         self.ast = self._lbmKernels[0].ast
 
         # -- Boundary Handling  & Synchronization ---
-        self._sync = data_handling.synchronization_function([self._pdf_arr_name], method_parameters['stencil'], target)
+        stencil_name = method_parameters['stencil']
+        self._sync_src = data_handling.synchronization_function([self._pdf_arr_name], stencil_name, target)
+        self._sync_tmp = data_handling.synchronization_function([self._tmp_arr_name], stencil_name, target)
+
         self._boundary_handling = LatticeBoltzmannBoundaryHandling(self.method, self._data_handling, self._pdf_arr_name,
                                                                    name=name + "_boundary_handling",
                                                                    flag_interface=flag_interface,
@@ -223,16 +226,42 @@ class LatticeBoltzmannStep:
     def time_step(self):
         if len(self._lbmKernels) == 2:  # collide stream
             self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params)
-            self._sync()
+            self._sync_src()
             self._boundary_handling(**self.kernel_params)
             self._data_handling.run_kernel(self._lbmKernels[1], **self.kernel_params)
         else:  # stream collide
-            self._sync()
+            self._sync_src()
             self._boundary_handling(**self.kernel_params)
             self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params)
 
         self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu)
-        self.time_steps_run += 1
+
+    def get_time_loop(self):
+        self.pre_run()  # make sure GPU arrays are allocated
+
+        fixed_loop = TimeLoop(steps=2)
+        fixed_loop.add_pre_run_function(self.pre_run)
+        fixed_loop.add_post_run_function(self.post_run)
+        fixed_loop.add_single_step_function(self.time_step)
+
+        for t in range(2):
+            if len(self._lbmKernels) == 2:  # collide stream
+                collide_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[0], **self.kernel_params)
+                fixed_loop.add_call(self._lbmKernels[0], collide_args)
+
+                fixed_loop.add_call(self._sync_src if t == 0 else self._sync_tmp, {})
+                self._boundary_handling.add_fixed_steps(fixed_loop, **self.kernel_params)
+
+                stream_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[1], **self.kernel_params)
+                fixed_loop.add_call(self._lbmKernels[1], stream_args)
+            else:  # stream collide
+                fixed_loop.add_call(self._sync_src if t == 0 else self._sync_tmp, {})
+                self._boundary_handling.add_fixed_steps(fixed_loop, **self.kernel_params)
+                stream_collide_args = self._data_handling.get_kernel_kwargs(self._lbmKernels[0], **self.kernel_params)
+                fixed_loop.add_call(self._lbmKernels[0], stream_collide_args)
+
+            self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu)
+        return fixed_loop
 
     def post_run(self):
         if self._gpu:
@@ -240,24 +269,31 @@ class LatticeBoltzmannStep:
         self._data_handling.run_kernel(self._getterKernel, **self.kernel_params)
 
     def run(self, time_steps):
+        time_loop = self.get_time_loop()
+        time_loop.run(time_steps)
+        self.time_steps_run += time_loop.time_steps_run
+
+    def run_old(self, time_steps):
         self.pre_run()
         for i in range(time_steps):
             self.time_step()
         self.post_run()
 
+        self.time_steps_run += time_steps
+
     def benchmark_run(self, time_steps):
-        time_loop = TimeLoop()
-        time_loop.add_step(self)
+        time_loop = self.get_time_loop()
         duration_of_time_step = time_loop.benchmark_run(time_steps)
         mlups = self.number_of_cells / duration_of_time_step * 1e-6
+        self.time_steps_run += time_loop.time_steps_run
         return mlups
 
     def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'):
-        time_loop = TimeLoop()
-        time_loop.add_step(self)
+        time_loop = self.get_time_loop()
         duration_of_time_step = time_loop.benchmark(time_for_benchmark, init_time_steps,
                                                     number_of_time_steps_for_estimation)
         mlups = self.number_of_cells / duration_of_time_step * 1e-6
+        self.time_steps_run += time_loop.time_steps_run
         return mlups
 
     def write_vtk(self):
@@ -328,7 +364,7 @@ class LatticeBoltzmannStep:
             self._data_handling.all_to_gpu()
             for i in range(check_residuum_after):
                 steps_run += 1
-                self._sync()
+                self._sync_src()
                 self._boundary_handling(**self.kernel_params)
                 self._data_handling.run_kernel(self._velocity_init_kernel, **self.kernel_params)
                 self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, gpu=gpu)
diff --git a/macroscopic_value_kernels.py b/macroscopic_value_kernels.py
index d908ac8b..2e712391 100644
--- a/macroscopic_value_kernels.py
+++ b/macroscopic_value_kernels.py
@@ -1,3 +1,4 @@
+import functools
 from copy import deepcopy
 from pystencils.field import Field, get_layout_of_array
 from lbmpy.simplificationfactory import create_simplification_strategy
@@ -7,12 +8,15 @@ def compile_macroscopic_values_getter(lb_method, output_quantities, pdf_arr=None
     """
     Create kernel to compute macroscopic value(s) from a pdf field (e.g. density or velocity)
 
-    :param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
-    :param output_quantities: sequence of quantities to compute e.g. ['density', 'velocity']
-    :param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
-    :param field_layout: layout for output field, also used for pdf field if pdf_arr is not given
-    :param target: 'cpu' or 'gpu'
-    :return: a function to compute macroscopic values:
+    Args:
+        lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
+        output_quantities: sequence of quantities to compute e.g. ['density', 'velocity']
+        pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
+        field_layout: layout for output field, also used for pdf field if pdf_arr is not given
+        target: 'cpu' or 'gpu'
+
+    Returns:
+        a function to compute macroscopic values; it takes the following arguments:
         - pdf_array
         - keyword arguments from name of conserved quantity (as in output_quantities) to numpy field
     """
@@ -83,12 +87,15 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None
     Creates a function that sets a pdf field to specified macroscopic quantities
     The returned function can be called with the pdf field to set as single argument
 
-    :param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
-    :param quantities_to_set: map from conserved quantity name to fixed value or numpy array
-    :param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
-    :param field_layout: layout of the pdf field if pdf_arr was not given
-    :param target: 'cpu' or 'gpu'
-    :return: function taking pdf array as single argument and which sets the field to the given values
+    Args:
+        lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
+        quantities_to_set: map from conserved quantity name to fixed value or numpy array
+        pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
+        field_layout: layout of the pdf field if pdf_arr was not given
+        target: 'cpu' or 'gpu'
+
+    Returns:
+        function that takes the pdf array as its single argument and sets the field to the given values
     """
     if pdf_arr is not None:
         pdf_field = Field.create_from_numpy_array('pdfs', pdf_arr, index_dimensions=1)
@@ -128,10 +135,12 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None
 
     if target == 'cpu':
         import pystencils.cpu as cpu
-        kernel = cpu.make_python_function(cpu.create_kernel(eq), argument_dict=fixed_kernel_parameters)
+        kernel = cpu.make_python_function(cpu.create_kernel(eq))
+        kernel = functools.partial(kernel, **fixed_kernel_parameters)
     elif target == 'gpu':
         import pystencils.gpucuda as gpu
-        kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq), argument_dict=fixed_kernel_parameters)
+        kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq))
+        kernel = functools.partial(kernel, **fixed_kernel_parameters)
     else:
         raise ValueError("Unknown target '%s'. Possible targets are 'cpu' and 'gpu'" % (target,))
 
@@ -153,7 +162,7 @@ def create_advanced_velocity_setter_collision_rule(method, velocity_field: Field
         method: lattice boltzmann method object
         velocity_field: pystencils field
         velocity_relaxation_rate: relaxation rate for the velocity moments - determines convergence behaviour
-                                of the initialization scheme
+                                  of the initialization scheme
 
     Returns:
         LB collision rule
-- 
GitLab