Skip to content
Snippets Groups Projects

[BugFix] Fix missing comunicated PDFs for FreeSlip

Merged Markus Holzer requested to merge holzer/lbmpy:FixComm into master
1 file
+ 111
129
Compare changes
  • Side-by-side
  • Inline
import itertools
from pystencils import Field, Assignment
from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
numeric_offsets, Timestep, get_timesteps
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, Timestep, get_timesteps
from pystencils.datahandling import SerialDataHandling
from pystencils.enums import Target
from itertools import chain
def _trim_slice_in_direction(slices, direction):
assert len(slices) == len(direction)
class LBMPeriodicityHandling:
result = []
for s, d in zip(slices, direction):
if isinstance(s, int):
result.append(s)
continue
start = s.start + 1 if d == -1 else s.start
stop = s.stop - 1 if d == 1 else s.stop
result.append(slice(start, stop, s.step))
def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1,
pycuda_direct_copy=True):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
return tuple(result)
**On the usage with cuda:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
"""
if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only serial data handling is supported!')
self.stencil = stencil
self.dim = stencil.D
self.dh = data_handling
def _extend_dir(direction):
if len(direction) == 0:
yield tuple()
elif direction[0] == 0:
for d in [-1, 0, 1]:
for rest in _extend_dir(direction[1:]):
yield (d, ) + rest
else:
for rest in _extend_dir(direction[1:]):
yield (direction[0], ) + rest
assert data_handling.default_target in [Target.CPU, Target.GPU]
self.target = data_handling.default_target
self.pdf_field_name = pdf_field_name
self.ghost_layers = ghost_layers
self.periodicity = data_handling.periodicity
self.inplace_pattern = is_inplace(streaming_pattern)
def _get_neighbour_transform(direction, ghost_layers):
return tuple(d * (ghost_layers + 1) for d in direction)
self.cpu = self.target == Target.CPU
self.pycuda_direct_copy = self.target == Target.GPU and pycuda_direct_copy
def is_copy_direction(direction):
s = 0
for d, p in zip(direction, self.periodicity):
s += abs(d)
if d != 0 and not p:
return False
def _fix_length_one_slices(slices):
"""Slices of length one are replaced by their start value for correct periodic shifting"""
if isinstance(slices, int):
return slices
elif isinstance(slices, slice):
if slices.stop is not None and abs(slices.start - slices.stop) == 1:
return slices.start
elif slices.stop is None and slices.start == -1:
return -1 # [-1:] also has length one
return s != 0
full_stencil = itertools.product(*([-1, 0, 1] for _ in range(self.dim)))
copy_directions = tuple(filter(is_copy_direction, full_stencil))
self.comm_slices = []
timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil,
comm_stencil=copy_directions,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=ghost_layers)
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
if self.target == Target.GPU and not pycuda_direct_copy:
self.device_copy_kernels = list()
for timestep in timesteps:
self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
def __call__(self, prev_timestep=Timestep.BOTH):
if self.cpu:
self._periodicity_handling_cpu(prev_timestep)
else:
return slices
else:
return tuple(_fix_length_one_slices(s) for s in slices)
self._periodicity_handling_gpu(prev_timestep)
def _periodicity_handling_cpu(self, prev_timestep):
arr = self.dh.cpu_arrays[self.pdf_field_name]
comm_slices = self.comm_slices[prev_timestep.idx]
for src, dst in comm_slices:
arr[dst] = arr[src]
def _compile_copy_kernels(self, timestep):
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
kernels.append(
periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
return kernels
def _periodicity_handling_gpu(self, prev_timestep):
arr = self.dh.gpu_arrays[self.pdf_field_name]
if self.pycuda_direct_copy:
for src, dst in self.comm_slices[prev_timestep.idx]:
arr[dst] = arr[src]
else:
kernel_args = {self.pdf_field_name: arr}
for kernel in self.device_copy_kernels[prev_timestep.idx]:
kernel(**kernel_args)
def get_communication_slices(
@@ -60,7 +104,7 @@ def get_communication_slices(
Return the source and destination slices for periodicity handling or communication between blocks.
:param stencil: The stencil used by the LB method.
:param comm_stencil: The stencil defining the communication directions. If None, it will be set to the
:param comm_stencil: The stencil defining the communication directions. If None, it will be set to the
full stencil (D2Q9 in 2D, D3Q27 in 3D, etc.).
:param streaming_pattern: The streaming pattern.
:param prev_timestep: Timestep after which communication is run.
@@ -83,13 +127,10 @@ def get_communication_slices(
for streaming_dir in set(_extend_dir(comm_dir)) & set(stencil):
d = stencil.index(streaming_dir)
write_offsets = numeric_offsets(write_accesses[d])
write_index = numeric_index(write_accesses[d])[0]
tangential_dir = tuple(s - c for s, c in zip(streaming_dir, comm_dir))
origin_slice = get_slice_before_ghost_layer(comm_dir, ghost_layers=ghost_layers, thickness=1)
origin_slice = _fix_length_one_slices(origin_slice)
src_slice = shift_slice(_trim_slice_in_direction(origin_slice, tangential_dir), write_offsets)
src_slice = _fix_length_one_slices(origin_slice)
neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers)
dst_slice = shift_slice(src_slice, neighbour_transform)
@@ -130,8 +171,9 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
copy_eq = Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
ast = create_cuda_kernel([copy_eq], iteration_slice=dst_slice, skip_independence_check=True)
copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
ast = create_cuda_kernel(copy_eq, config=config)
if target == Target.GPU:
from pystencils.gpucuda import make_python_function
return make_python_function(ast)
@@ -139,92 +181,32 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
raise ValueError('Invalid target:', target)
class LBMPeriodicityHandling:
def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1,
pycuda_direct_copy=True):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
"""
if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only serial data handling is supported!')
self.stencil = stencil
self.dim = stencil.D
self.dh = data_handling
target = data_handling.default_target
assert target in [Target.CPU, Target.GPU]
self.pdf_field_name = pdf_field_name
self.ghost_layers = ghost_layers
periodicity = data_handling.periodicity
self.inplace_pattern = is_inplace(streaming_pattern)
self.target = target
self.cpu = target == Target.CPU
self.pycuda_direct_copy = target == Target.GPU and pycuda_direct_copy
def is_copy_direction(direction):
s = 0
for d, p in zip(direction, periodicity):
s += abs(d)
if d != 0 and not p:
return False
return s != 0
full_stencil = itertools.product(*([-1, 0, 1] for _ in range(self.dim)))
copy_directions = tuple(filter(is_copy_direction, full_stencil))
self.comm_slices = []
timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil,
comm_stencil=copy_directions,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=ghost_layers)
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
if target == Target.GPU and not pycuda_direct_copy:
self.device_copy_kernels = []
for timestep in timesteps:
self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
def _extend_dir(direction):
if len(direction) == 0:
yield tuple()
elif direction[0] == 0:
for d in [-1, 0, 1]:
for rest in _extend_dir(direction[1:]):
yield (d, ) + rest
else:
for rest in _extend_dir(direction[1:]):
yield (direction[0], ) + rest
def __call__(self, prev_timestep=Timestep.BOTH):
if self.cpu:
self._periodicity_handling_cpu(prev_timestep)
else:
self._periodicity_handling_gpu(prev_timestep)
def _periodicity_handling_cpu(self, prev_timestep):
arr = self.dh.cpu_arrays[self.pdf_field_name]
comm_slices = self.comm_slices[prev_timestep.idx]
for src, dst in comm_slices:
arr[dst] = arr[src]
def _get_neighbour_transform(direction, ghost_layers):
return tuple(d * (ghost_layers + 1) for d in direction)
def _compile_copy_kernels(self, timestep):
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
kernels.append(
periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
return kernels
def _periodicity_handling_gpu(self, prev_timestep):
arr = self.dh.gpu_arrays[self.pdf_field_name]
if self.pycuda_direct_copy:
for src, dst in self.comm_slices[prev_timestep.idx]:
arr[dst] = arr[src]
def _fix_length_one_slices(slices):
"""Slices of length one are replaced by their start value for correct periodic shifting"""
if isinstance(slices, int):
return slices
elif isinstance(slices, slice):
if slices.stop is not None and abs(slices.start - slices.stop) == 1:
return slices.start
elif slices.stop is None and slices.start == -1:
return -1 # [-1:] also has length one
else:
kernel_args = {self.pdf_field_name: arr}
for kernel in self.device_copy_kernels[prev_timestep.idx]:
kernel(**kernel_args)
return slices
else:
return tuple(_fix_length_one_slices(s) for s in slices)
Loading