Skip to content
Snippets Groups Projects
Commit db4edb4b authored by Martin Bauer's avatar Martin Bauer
Browse files

New Time Loop - makes small block scenarios faster

- new time loop caches all kernel functions with their argument dict
  -> inner loop just calls C functions with cached kwargs
parent 8a88243e
No related branches found
No related tags found
No related merge requests found
...@@ -119,7 +119,10 @@ class LatticeBoltzmannStep: ...@@ -119,7 +119,10 @@ class LatticeBoltzmannStep:
self.ast = self._lbmKernels[0].ast self.ast = self._lbmKernels[0].ast
# -- Boundary Handling & Synchronization --- # -- Boundary Handling & Synchronization ---
self._sync = data_handling.synchronization_function([self._pdf_arr_name], method_parameters['stencil'], target) stencil_name = method_parameters['stencil']
self._sync_src = data_handling.synchronization_function([self._pdf_arr_name], stencil_name, target)
self._sync_tmp = data_handling.synchronization_function([self._tmp_arr_name], stencil_name, target)
self._boundary_handling = LatticeBoltzmannBoundaryHandling(self.method, self._data_handling, self._pdf_arr_name, self._boundary_handling = LatticeBoltzmannBoundaryHandling(self.method, self._data_handling, self._pdf_arr_name,
name=name + "_boundary_handling", name=name + "_boundary_handling",
flag_interface=flag_interface, flag_interface=flag_interface,
...@@ -223,16 +226,42 @@ class LatticeBoltzmannStep: ...@@ -223,16 +226,42 @@ class LatticeBoltzmannStep:
def time_step(self): def time_step(self):
if len(self._lbmKernels) == 2: # collide stream if len(self._lbmKernels) == 2: # collide stream
self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params) self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params)
self._sync() self._sync_src()
self._boundary_handling(**self.kernel_params) self._boundary_handling(**self.kernel_params)
self._data_handling.run_kernel(self._lbmKernels[1], **self.kernel_params) self._data_handling.run_kernel(self._lbmKernels[1], **self.kernel_params)
else: # stream collide else: # stream collide
self._sync() self._sync_src()
self._boundary_handling(**self.kernel_params) self._boundary_handling(**self.kernel_params)
self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params) self._data_handling.run_kernel(self._lbmKernels[0], **self.kernel_params)
self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu) self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu)
self.time_steps_run += 1
def get_time_loop(self):
    """Assemble a ``TimeLoop`` in which every call is registered together with its argument dict.

    The loop is created with ``steps=2`` and two consecutive sub-steps are
    recorded. The pdf and temporary arrays are swapped after each recorded
    sub-step, so the kernel kwargs fetched for the second sub-step are
    resolved after the first swap. The first sub-step uses ``_sync_src`` and
    the second uses ``_sync_tmp``.

    ``pre_run``, ``post_run`` and ``time_step`` are registered on the loop as
    pre-run, post-run and single-step functions respectively.

    Returns:
        the populated ``TimeLoop`` instance
    """
    self.pre_run()  # make sure GPU arrays are allocated

    loop = TimeLoop(steps=2)
    loop.add_pre_run_function(self.pre_run)
    loop.add_post_run_function(self.post_run)
    loop.add_single_step_function(self.time_step)

    dh = self._data_handling
    kernels = self._lbmKernels
    separate_collide_and_stream = len(kernels) == 2

    for sync in (self._sync_src, self._sync_tmp):
        if separate_collide_and_stream:  # collide stream
            # kwargs are looked up right before each registration, i.e. against
            # the array assignment that is current for this sub-step
            loop.add_call(kernels[0], dh.get_kernel_kwargs(kernels[0], **self.kernel_params))
            loop.add_call(sync, {})
            self._boundary_handling.add_fixed_steps(loop, **self.kernel_params)
            loop.add_call(kernels[1], dh.get_kernel_kwargs(kernels[1], **self.kernel_params))
        else:  # stream collide
            loop.add_call(sync, {})
            self._boundary_handling.add_fixed_steps(loop, **self.kernel_params)
            loop.add_call(kernels[0], dh.get_kernel_kwargs(kernels[0], **self.kernel_params))
        # exchange pdf and temporary arrays before recording the next sub-step
        dh.swap(self._pdf_arr_name, self._tmp_arr_name, self._gpu)

    return loop
def post_run(self): def post_run(self):
if self._gpu: if self._gpu:
...@@ -240,24 +269,31 @@ class LatticeBoltzmannStep: ...@@ -240,24 +269,31 @@ class LatticeBoltzmannStep:
self._data_handling.run_kernel(self._getterKernel, **self.kernel_params) self._data_handling.run_kernel(self._getterKernel, **self.kernel_params)
def run(self, time_steps): def run(self, time_steps):
time_loop = self.get_time_loop()
time_loop.run(time_steps)
self.time_steps_run += time_loop.time_steps_run
def run_old(self, time_steps):
self.pre_run() self.pre_run()
for i in range(time_steps): for i in range(time_steps):
self.time_step() self.time_step()
self.post_run() self.post_run()
self.time_steps_run += time_steps
def benchmark_run(self, time_steps): def benchmark_run(self, time_steps):
time_loop = TimeLoop() time_loop = self.get_time_loop()
time_loop.add_step(self)
duration_of_time_step = time_loop.benchmark_run(time_steps) duration_of_time_step = time_loop.benchmark_run(time_steps)
mlups = self.number_of_cells / duration_of_time_step * 1e-6 mlups = self.number_of_cells / duration_of_time_step * 1e-6
self.time_steps_run += time_loop.time_steps_run
return mlups return mlups
def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'): def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'):
time_loop = TimeLoop() time_loop = self.get_time_loop()
time_loop.add_step(self)
duration_of_time_step = time_loop.benchmark(time_for_benchmark, init_time_steps, duration_of_time_step = time_loop.benchmark(time_for_benchmark, init_time_steps,
number_of_time_steps_for_estimation) number_of_time_steps_for_estimation)
mlups = self.number_of_cells / duration_of_time_step * 1e-6 mlups = self.number_of_cells / duration_of_time_step * 1e-6
self.time_steps_run += time_loop.time_steps_run
return mlups return mlups
def write_vtk(self): def write_vtk(self):
...@@ -328,7 +364,7 @@ class LatticeBoltzmannStep: ...@@ -328,7 +364,7 @@ class LatticeBoltzmannStep:
self._data_handling.all_to_gpu() self._data_handling.all_to_gpu()
for i in range(check_residuum_after): for i in range(check_residuum_after):
steps_run += 1 steps_run += 1
self._sync() self._sync_src()
self._boundary_handling(**self.kernel_params) self._boundary_handling(**self.kernel_params)
self._data_handling.run_kernel(self._velocity_init_kernel, **self.kernel_params) self._data_handling.run_kernel(self._velocity_init_kernel, **self.kernel_params)
self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, gpu=gpu) self._data_handling.swap(self._pdf_arr_name, self._tmp_arr_name, gpu=gpu)
......
import functools
from copy import deepcopy from copy import deepcopy
from pystencils.field import Field, get_layout_of_array from pystencils.field import Field, get_layout_of_array
from lbmpy.simplificationfactory import create_simplification_strategy from lbmpy.simplificationfactory import create_simplification_strategy
...@@ -7,12 +8,15 @@ def compile_macroscopic_values_getter(lb_method, output_quantities, pdf_arr=None ...@@ -7,12 +8,15 @@ def compile_macroscopic_values_getter(lb_method, output_quantities, pdf_arr=None
""" """
Create kernel to compute macroscopic value(s) from a pdf field (e.g. density or velocity) Create kernel to compute macroscopic value(s) from a pdf field (e.g. density or velocity)
:param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod` Args:
:param output_quantities: sequence of quantities to compute e.g. ['density', 'velocity'] lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
:param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel output_quantities: sequence of quantities to compute e.g. ['density', 'velocity']
:param field_layout: layout for output field, also used for pdf field if pdf_arr is not given pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
:param target: 'cpu' or 'gpu' field_layout: layout for output field, also used for pdf field if pdf_arr is not given
:return: a function to compute macroscopic values: target: 'cpu' or 'gpu'
Returns:
a function to compute macroscopic values:
- pdf_array - pdf_array
- keyword arguments from name of conserved quantity (as in output_quantities) to numpy field - keyword arguments from name of conserved quantity (as in output_quantities) to numpy field
""" """
...@@ -83,12 +87,15 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None ...@@ -83,12 +87,15 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None
Creates a function that sets a pdf field to specified macroscopic quantities Creates a function that sets a pdf field to specified macroscopic quantities
The returned function can be called with the pdf field to set as single argument The returned function can be called with the pdf field to set as single argument
:param lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod` Args:
:param quantities_to_set: map from conserved quantity name to fixed value or numpy array lb_method: instance of :class:`lbmpy.methods.AbstractLbMethod`
:param pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel quantities_to_set: map from conserved quantity name to fixed value or numpy array
:param field_layout: layout of the pdf field if pdf_arr was not given pdf_arr: optional numpy array for pdf field - used to get optimal loop structure for kernel
:param target: 'cpu' or 'gpu' field_layout: layout of the pdf field if pdf_arr was not given
:return: function taking pdf array as single argument and which sets the field to the given values target: 'cpu' or 'gpu'
Returns:
function taking pdf array as single argument and which sets the field to the given values
""" """
if pdf_arr is not None: if pdf_arr is not None:
pdf_field = Field.create_from_numpy_array('pdfs', pdf_arr, index_dimensions=1) pdf_field = Field.create_from_numpy_array('pdfs', pdf_arr, index_dimensions=1)
...@@ -128,10 +135,12 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None ...@@ -128,10 +135,12 @@ def compile_macroscopic_values_setter(lb_method, quantities_to_set, pdf_arr=None
if target == 'cpu': if target == 'cpu':
import pystencils.cpu as cpu import pystencils.cpu as cpu
kernel = cpu.make_python_function(cpu.create_kernel(eq), argument_dict=fixed_kernel_parameters) kernel = cpu.make_python_function(cpu.create_kernel(eq))
kernel = functools.partial(kernel, **fixed_kernel_parameters)
elif target == 'gpu': elif target == 'gpu':
import pystencils.gpucuda as gpu import pystencils.gpucuda as gpu
kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq), argument_dict=fixed_kernel_parameters) kernel = gpu.make_python_function(gpu.create_cuda_kernel(eq))
kernel = functools.partial(kernel, **fixed_kernel_parameters)
else: else:
raise ValueError("Unknown target '%s'. Possible targets are 'cpu' and 'gpu'" % (target,)) raise ValueError("Unknown target '%s'. Possible targets are 'cpu' and 'gpu'" % (target,))
...@@ -153,7 +162,7 @@ def create_advanced_velocity_setter_collision_rule(method, velocity_field: Field ...@@ -153,7 +162,7 @@ def create_advanced_velocity_setter_collision_rule(method, velocity_field: Field
method: lattice boltzmann method object method: lattice boltzmann method object
velocity_field: pystencils field velocity_field: pystencils field
velocity_relaxation_rate: relaxation rate for the velocity moments - determines convergence behaviour velocity_relaxation_rate: relaxation rate for the velocity moments - determines convergence behaviour
of the initialization scheme of the initialization scheme
Returns: Returns:
LB collision rule LB collision rule
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment