Skip to content
Snippets Groups Projects

[BugFix] Fix missing comunicated PDFs for FreeSlip

Merged Markus Holzer requested to merge holzer/lbmpy:FixComm into master
Viewing commit 102281ae
Prev
Show latest version
1 file
+ 108
123
Preferences
Compare changes
import itertools
from pystencils import Field, Assignment
from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, Timestep, get_timesteps
from pystencils.datahandling import SerialDataHandling
@@ -7,50 +7,95 @@ from pystencils.enums import Target
from itertools import chain
def _trim_slice_in_direction(slices, direction):
assert len(slices) == len(direction)
class LBMPeriodicityHandling:
result = []
for s, d in zip(slices, direction):
if isinstance(s, int):
result.append(s)
continue
start = s.start + 1 if d == -1 else s.start
stop = s.stop - 1 if d == 1 else s.stop
result.append(slice(start, stop, s.step))
def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1,
pycuda_direct_copy=True):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
return tuple(result)
**On the usage with cuda:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
"""
if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only serial data handling is supported!')
self.stencil = stencil
self.dim = stencil.D
self.dh = data_handling
def _extend_dir(direction):
if len(direction) == 0:
yield tuple()
elif direction[0] == 0:
for d in [-1, 0, 1]:
for rest in _extend_dir(direction[1:]):
yield (d, ) + rest
else:
for rest in _extend_dir(direction[1:]):
yield (direction[0], ) + rest
assert data_handling.default_target in [Target.CPU, Target.GPU]
self.target = data_handling.default_target
self.pdf_field_name = pdf_field_name
self.ghost_layers = ghost_layers
self.periodicity = data_handling.periodicity
self.inplace_pattern = is_inplace(streaming_pattern)
def _get_neighbour_transform(direction, ghost_layers):
return tuple(d * (ghost_layers + 1) for d in direction)
self.cpu = self.target == Target.CPU
self.pycuda_direct_copy = self.target == Target.GPU and pycuda_direct_copy
def is_copy_direction(direction):
s = 0
for d, p in zip(direction, self.periodicity):
s += abs(d)
if d != 0 and not p:
return False
def _fix_length_one_slices(slices):
"""Slices of length one are replaced by their start value for correct periodic shifting"""
if isinstance(slices, int):
return slices
elif isinstance(slices, slice):
if slices.stop is not None and abs(slices.start - slices.stop) == 1:
return slices.start
elif slices.stop is None and slices.start == -1:
return -1 # [-1:] also has length one
return s != 0
full_stencil = itertools.product(*([-1, 0, 1] for _ in range(self.dim)))
copy_directions = tuple(filter(is_copy_direction, full_stencil))
self.comm_slices = []
timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil,
comm_stencil=copy_directions,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=ghost_layers)
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
if self.target == Target.GPU and not pycuda_direct_copy:
self.device_copy_kernels = list()
for timestep in timesteps:
self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
def __call__(self, prev_timestep=Timestep.BOTH):
if self.cpu:
self._periodicity_handling_cpu(prev_timestep)
else:
return slices
else:
return tuple(_fix_length_one_slices(s) for s in slices)
self._periodicity_handling_gpu(prev_timestep)
def _periodicity_handling_cpu(self, prev_timestep):
arr = self.dh.cpu_arrays[self.pdf_field_name]
comm_slices = self.comm_slices[prev_timestep.idx]
for src, dst in comm_slices:
arr[dst] = arr[src]
def _compile_copy_kernels(self, timestep):
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
kernels.append(
periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
return kernels
def _periodicity_handling_gpu(self, prev_timestep):
arr = self.dh.gpu_arrays[self.pdf_field_name]
if self.pycuda_direct_copy:
for src, dst in self.comm_slices[prev_timestep.idx]:
arr[dst] = arr[src]
else:
kernel_args = {self.pdf_field_name: arr}
for kernel in self.device_copy_kernels[prev_timestep.idx]:
kernel(**kernel_args)
def get_communication_slices(
@@ -110,7 +155,6 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
src_slice = src_slice[:-1]
dst_slice = dst_slice[:-1]
# TODO this is the domain_size with GL
if domain_size is None:
domain_size = pdf_field.spatial_shape
@@ -127,8 +171,9 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
copy_eq = Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
ast = create_cuda_kernel([copy_eq], iteration_slice=dst_slice, skip_independence_check=True)
copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
ast = create_cuda_kernel(copy_eq, config=config)
if target == Target.GPU:
from pystencils.gpucuda import make_python_function
return make_python_function(ast)
@@ -136,92 +181,32 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
raise ValueError('Invalid target:', target)
class LBMPeriodicityHandling:
def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1,
pycuda_direct_copy=True):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
"""
if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only serial data handling is supported!')
self.stencil = stencil
self.dim = stencil.D
self.dh = data_handling
target = data_handling.default_target
assert target in [Target.CPU, Target.GPU]
self.pdf_field_name = pdf_field_name
self.ghost_layers = ghost_layers
periodicity = data_handling.periodicity
self.inplace_pattern = is_inplace(streaming_pattern)
self.target = target
self.cpu = target == Target.CPU
self.pycuda_direct_copy = target == Target.GPU and pycuda_direct_copy
def is_copy_direction(direction):
s = 0
for d, p in zip(direction, periodicity):
s += abs(d)
if d != 0 and not p:
return False
return s != 0
full_stencil = itertools.product(*([-1, 0, 1] for _ in range(self.dim)))
copy_directions = tuple(filter(is_copy_direction, full_stencil))
self.comm_slices = []
timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil,
comm_stencil=copy_directions,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=ghost_layers)
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
if target == Target.GPU and not pycuda_direct_copy:
self.device_copy_kernels = []
for timestep in timesteps:
self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
def _extend_dir(direction):
if len(direction) == 0:
yield tuple()
elif direction[0] == 0:
for d in [-1, 0, 1]:
for rest in _extend_dir(direction[1:]):
yield (d, ) + rest
else:
for rest in _extend_dir(direction[1:]):
yield (direction[0], ) + rest
def __call__(self, prev_timestep=Timestep.BOTH):
if self.cpu:
self._periodicity_handling_cpu(prev_timestep)
else:
self._periodicity_handling_gpu(prev_timestep)
def _periodicity_handling_cpu(self, prev_timestep):
arr = self.dh.cpu_arrays[self.pdf_field_name]
comm_slices = self.comm_slices[prev_timestep.idx]
for src, dst in comm_slices:
arr[dst] = arr[src]
def _get_neighbour_transform(direction, ghost_layers):
return tuple(d * (ghost_layers + 1) for d in direction)
def _compile_copy_kernels(self, timestep):
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
kernels.append(
periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
return kernels
def _periodicity_handling_gpu(self, prev_timestep):
arr = self.dh.gpu_arrays[self.pdf_field_name]
if self.pycuda_direct_copy:
for src, dst in self.comm_slices[prev_timestep.idx]:
arr[dst] = arr[src]
def _fix_length_one_slices(slices):
"""Slices of length one are replaced by their start value for correct periodic shifting"""
if isinstance(slices, int):
return slices
elif isinstance(slices, slice):
if slices.stop is not None and abs(slices.start - slices.stop) == 1:
return slices.start
elif slices.stop is None and slices.start == -1:
return -1 # [-1:] also has length one
else:
kernel_args = {self.pdf_field_name: arr}
for kernel in self.device_copy_kernels[prev_timestep.idx]:
kernel(**kernel_args)
return slices
else:
return tuple(_fix_length_one_slices(s) for s in slices)