diff --git a/src/lbmpy/advanced_streaming/communication.py b/src/lbmpy/advanced_streaming/communication.py
index 412247bb409ad98973a085b37de4da5c3c737fae..c02b6e2229d3f1a3da0e81cfdd70a6e5c5ff5e87 100644
--- a/src/lbmpy/advanced_streaming/communication.py
+++ b/src/lbmpy/advanced_streaming/communication.py
@@ -1,8 +1,18 @@
 import itertools
 from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
-from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
-from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
-    Timestep, get_timesteps, numeric_offsets
+from pystencils.slicing import (
+    shift_slice,
+    get_slice_before_ghost_layer,
+    normalize_slice,
+)
+from lbmpy.advanced_streaming.utility import (
+    is_inplace,
+    get_accessor,
+    numeric_index,
+    Timestep,
+    get_timesteps,
+    numeric_offsets,
+)
 from pystencils.datahandling import SerialDataHandling
 from pystencils.enums import Target
 from itertools import chain
@@ -10,22 +20,28 @@ from itertools import chain


 class LBMPeriodicityHandling:

-    def __init__(self, stencil, data_handling, pdf_field_name,
-                 streaming_pattern='pull', ghost_layers=1,
-                 cupy_direct_copy=True):
+    def __init__(
+        self,
+        stencil,
+        data_handling,
+        pdf_field_name,
+        streaming_pattern="pull",
+        ghost_layers=1,
+        cupy_direct_copy=True,
+    ):
         """
-            Periodicity Handling for Lattice Boltzmann Streaming.
-
-            **On the usage with cuda:**
-            - cupy allows the copying of sliced arrays within device memory using the numpy syntax,
-            e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
-            handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
-            compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
-            but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
-            Choose your weapon depending on your use case.
+        Periodicity Handling for Lattice Boltzmann Streaming.
+
+        **On the usage with CUDA:**
+        - cupy allows the copying of sliced arrays within device memory using the numpy syntax,
+        e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
+        handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
+        compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
+        but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
+        Choose your weapon depending on your use case.
""" if not isinstance(data_handling, SerialDataHandling): - raise ValueError('Only serial data handling is supported!') + raise ValueError("Only serial data handling is supported!") self.stencil = stencil self.dim = stencil.D @@ -56,12 +72,16 @@ class LBMPeriodicityHandling: self.comm_slices = [] timesteps = get_timesteps(streaming_pattern) for timestep in timesteps: - slices_per_comm_dir = get_communication_slices(stencil=stencil, - comm_stencil=copy_directions, - streaming_pattern=streaming_pattern, - prev_timestep=timestep, - ghost_layers=ghost_layers) - self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))) + slices_per_comm_dir = get_communication_slices( + stencil=stencil, + comm_stencil=copy_directions, + streaming_pattern=streaming_pattern, + prev_timestep=timestep, + ghost_layers=ghost_layers, + ) + self.comm_slices.append( + list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())) + ) if self.target == Target.GPU and not cupy_direct_copy: self.device_copy_kernels = list() @@ -81,11 +101,11 @@ class LBMPeriodicityHandling: arr[dst] = arr[src] def _compile_copy_kernels(self, timestep): + assert self.target == Target.GPU pdf_field = self.dh.fields[self.pdf_field_name] kernels = [] for src, dst in self.comm_slices[timestep.idx]: - kernels.append( - periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target)) + kernels.append(periodic_pdf_gpu_copy_kernel(pdf_field, src, dst)) return kernels def _periodicity_handling_gpu(self, prev_timestep): @@ -100,7 +120,12 @@ class LBMPeriodicityHandling: def get_communication_slices( - stencil, comm_stencil=None, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1): + stencil, + comm_stencil=None, + streaming_pattern="pull", + prev_timestep=Timestep.BOTH, + ghost_layers=1, +): """ Return the source and destination slices for periodicity handling or communication between blocks. @@ -116,7 +141,9 @@ def get_communication_slices( if comm_stencil is None: comm_stencil = itertools.product(*([-1, 0, 1] for _ in range(stencil.D))) - pdfs = Field.create_generic('pdfs', spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,)) + pdfs = Field.create_generic( + "pdfs", spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,) + ) write_accesses = get_accessor(streaming_pattern, prev_timestep).write(pdfs, stencil) slices_per_comm_direction = dict() @@ -130,7 +157,9 @@ def get_communication_slices( d = stencil.index(streaming_dir) write_index = numeric_index(write_accesses[d])[0] - origin_slice = get_slice_before_ghost_layer(comm_dir, ghost_layers=ghost_layers, thickness=1) + origin_slice = get_slice_before_ghost_layer( + comm_dir, ghost_layers=ghost_layers, thickness=1 + ) src_slice = _fix_length_one_slices(origin_slice) write_offsets = numeric_offsets(write_accesses[d]) @@ -138,13 +167,15 @@ def get_communication_slices( # TODO: this is just a hotfix. 
             # TODO: this is just a hotfix. _trim_slice_in_direction breaks FreeSlip BC with adjacent periodic side
             if streaming_pattern != "pull":
-                src_slice = shift_slice(_trim_slice_in_direction(src_slice, tangential_dir), write_offsets)
+                src_slice = shift_slice(
+                    _trim_slice_in_direction(src_slice, tangential_dir), write_offsets
+                )

             neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers)
             dst_slice = shift_slice(src_slice, neighbour_transform)

-            src_slice = src_slice + (write_index, )
-            dst_slice = dst_slice + (write_index, )
+            src_slice = src_slice + (write_index,)
+            dst_slice = dst_slice + (write_index,)

             slices_for_dir.append((src_slice, dst_slice))
@@ -152,10 +183,10 @@ def get_communication_slices(
     return slices_per_comm_direction


-def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
-                             domain_size=None, target=Target.GPU):
-    """Copies a rectangular array slice onto another non-overlapping array slice"""
-    from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+def periodic_pdf_gpu_copy_kernel(pdf_field, src_slice, dst_slice, domain_size=None):
+    """Generate a GPU kernel which copies all values from one slice of a field
+    to another non-overlapping slice."""
+    from pystencils import create_kernel

     pdf_idx = src_slice[-1]
     assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
@@ -176,18 +207,28 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
     def _stop(s):
         return s.stop if isinstance(s, slice) else s

-    offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
-    assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
-        "Slices have to have same size"
-
-    copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
-    config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
-    ast = create_cuda_kernel(copy_eq, config=config)
-    if target == Target.GPU:
-        from pystencils.gpucuda import make_python_function
-        return make_python_function(ast)
-    else:
-        raise ValueError('Invalid target:', target)
+    offset = [
+        _start(s1) - _start(s2)
+        for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
+    ]
+    assert offset == [
+        _stop(s1) - _stop(s2)
+        for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
+    ], "Slices have to have same size"
+
+    copy_eq = AssignmentCollection(
+        main_assignments=[
+            Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
+        ]
+    )
+    config = CreateKernelConfig(
+        iteration_slice=dst_slice,
+        skip_independence_check=True,
+        target=Target.GPU,
+    )
+
+    ast = create_kernel(copy_eq, config=config)
+    return ast.compile()


 def _extend_dir(direction):
@@ -196,10 +237,10 @@ def _extend_dir(direction):
     elif direction[0] == 0:
         for d in [-1, 0, 1]:
             for rest in _extend_dir(direction[1:]):
-                yield (d, ) + rest
+                yield (d,) + rest
     else:
         for rest in _extend_dir(direction[1:]):
-            yield (direction[0], ) + rest
+            yield (direction[0],) + rest


 def _get_neighbour_transform(direction, ghost_layers):
diff --git a/src/lbmpy/moment_transforms/rawmomenttransforms.py b/src/lbmpy/moment_transforms/rawmomenttransforms.py
index 11013841d0cda6cec0f57bdf5da40cbfd2fc302d..63952347c1075aab1bdfe473ecd92679d466f986 100644
--- a/src/lbmpy/moment_transforms/rawmomenttransforms.py
+++ b/src/lbmpy/moment_transforms/rawmomenttransforms.py
@@ -172,7 +172,7 @@ class PdfsToMomentsByMatrixTransform(AbstractRawMomentTransform):
     # ----------------------------- Private Members -----------------------------

-    @ property
+    @property
     def _default_simplification(self):
         forward_simp = SimplificationStrategy()
         # forward_simp.add(substitute_moments_in_conserved_quantity_equations)
@@ -218,7 +218,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
                                                                                 self.moment_polynomials)
         self.poly_to_mono_matrix = self.mono_to_poly_matrix.inv()

-    @ property
+    @property
     def absorbs_conserved_quantity_equations(self):
         return True

@@ -414,7 +414,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
     # ----------------------------- Private Members -----------------------------

-    @ property
+    @property
     def _default_simplification(self):
         from lbmpy.methods.momentbased.momentbasedsimplifications import (
             substitute_moments_in_conserved_quantity_equations,
diff --git a/tests/advanced_streaming/test_communication.py b/tests/advanced_streaming/test_communication.py
index dddeeea69b1957691acd89bc0d8652f99d628673..9328e8963e42dac8b681647e5f72d575d20049ef 100644
--- a/tests/advanced_streaming/test_communication.py
+++ b/tests/advanced_streaming/test_communication.py
@@ -5,36 +5,52 @@
 import numpy as np

 from lbmpy.stencils import LBStencil
 from pystencils.slicing import get_slice_before_ghost_layer, get_ghost_region_slice
 from lbmpy.creationfunctions import create_lb_update_rule, LBMConfig, LBMOptimisation
-from lbmpy.advanced_streaming.communication import get_communication_slices, _fix_length_one_slices, \
-    LBMPeriodicityHandling
+from lbmpy.advanced_streaming.communication import (
+    get_communication_slices,
+    _fix_length_one_slices,
+    LBMPeriodicityHandling,
+    periodic_pdf_gpu_copy_kernel,
+)
 from lbmpy.advanced_streaming.utility import streaming_patterns, Timestep
 from lbmpy.enums import Stencil

 import pytest


-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
-@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
-@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
 def test_slices_not_empty(stencil, streaming_pattern, timestep):
     stencil = LBStencil(stencil)
     arr = np.zeros((4,) * stencil.D + (stencil.Q,))
-    slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
-                                      ghost_layers=1)
+    slices = get_communication_slices(
+        stencil,
+        streaming_pattern=streaming_pattern,
+        prev_timestep=timestep,
+        ghost_layers=1,
+    )
     for _, slices_list in slices.items():
         for src, dst in slices_list:
             assert all(s != 0 for s in arr[src].shape)
             assert all(s != 0 for s in arr[dst].shape)


-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
-@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
-@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
 def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
     stencil = LBStencil(stencil)
     arr = np.zeros((4,) * stencil.D + (stencil.Q,))
-    slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
-                                      ghost_layers=1)
+    slices = get_communication_slices(
+        stencil,
+        streaming_pattern=streaming_pattern,
+        prev_timestep=timestep,
+        ghost_layers=1,
+    )
     for _, slices_list in slices.items():
         for src, dst in slices_list:
             src_shape = arr[src].shape
@@ -42,12 +58,15 @@ def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
             assert src_shape == dst_shape


-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
 def test_pull_communication_slices(stencil):
     stencil = LBStencil(stencil)

     slices = get_communication_slices(
-        stencil, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1)
+        stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
+    )
     for i, d in enumerate(stencil):
         if i == 0:
             continue
@@ -58,21 +77,115 @@ def test_pull_communication_slices(stencil):
                 dst = s[1][:-1]
                 break

-        inner_slice = _fix_length_one_slices(get_slice_before_ghost_layer(d, ghost_layers=1))
+        inner_slice = _fix_length_one_slices(
+            get_slice_before_ghost_layer(d, ghost_layers=1)
+        )
         inv_dir = (-e for e in d)
-        gl_slice = _fix_length_one_slices(get_ghost_region_slice(inv_dir, ghost_layers=1))
+        gl_slice = _fix_length_one_slices(
+            get_ghost_region_slice(inv_dir, ghost_layers=1)
+        )
         assert src == inner_slice
         assert dst == gl_slice


-@pytest.mark.parametrize('stencil_name', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
+@pytest.mark.parametrize("direction", LBStencil(Stencil.D3Q27).stencil_entries)
+@pytest.mark.parametrize("pull", [False, True])
+def test_gpu_comm_kernels(direction: tuple, pull: bool):
+    pytest.importorskip("cupy")
+
+    stencil = LBStencil(Stencil.D3Q27)
+    inv_dir = stencil[stencil.inverse_index(direction)]
+    target = ps.Target.GPU
+
+    domain_size = (4,) * stencil.D
+
+    dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
+
+    field = dh.add_array("field", values_per_cell=2)
+
+    if pull:
+        dst_slice = get_ghost_region_slice(inv_dir)
+        src_slice = get_slice_before_ghost_layer(direction)
+    else:
+        dst_slice = get_slice_before_ghost_layer(direction)
+        src_slice = get_ghost_region_slice(inv_dir)
+
+    src_slice += (1,)
+    dst_slice += (1,)
+
+    kernel = periodic_pdf_gpu_copy_kernel(field, src_slice, dst_slice)
+
+    dh.cpu_arrays[field.name][src_slice] = 42.0
+    dh.all_to_gpu()
+
+    dh.run_kernel(kernel)
+
+    dh.all_to_cpu()
+    np.testing.assert_equal(dh.cpu_arrays[field.name][dst_slice], 42.0)
+
+
+@pytest.mark.parametrize("stencil", [Stencil.D2Q9, Stencil.D3Q19])
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+def test_direct_copy_and_kernels_equivalence(stencil: Stencil, streaming_pattern: str):
+    pytest.importorskip("cupy")
+
+    target = ps.Target.GPU
+    stencil = LBStencil(stencil)
+    domain_size = (4,) * stencil.D
+
+    dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
+
+    pdfs_a = dh.add_array("pdfs_a", values_per_cell=stencil.Q)
+    pdfs_b = dh.add_array("pdfs_b", values_per_cell=stencil.Q)
+
+    dh.fill(pdfs_a.name, 0.0, ghost_layers=True)
+    dh.fill(pdfs_b.name, 0.0, ghost_layers=True)
+
+    for q in range(stencil.Q):
+        sl = ps.make_slice[:4, :4, q] if stencil.D == 2 else ps.make_slice[:4, :4, :4, q]
+        dh.cpu_arrays[pdfs_a.name][sl] = q
+        dh.cpu_arrays[pdfs_b.name][sl] = q
+
+    dh.all_to_gpu()
+
+    direct_copy = LBMPeriodicityHandling(stencil, dh, pdfs_a.name, streaming_pattern, cupy_direct_copy=True)
+    kernels_copy = LBMPeriodicityHandling(stencil, dh, pdfs_b.name, streaming_pattern, cupy_direct_copy=False)
+
+    direct_copy(Timestep.EVEN)
+    kernels_copy(Timestep.EVEN)
+
+    dh.all_to_cpu()
+
+    np.testing.assert_equal(
+        dh.cpu_arrays[pdfs_a.name],
+        dh.cpu_arrays[pdfs_b.name]
+    )
+
+
+@pytest.mark.parametrize(
+    "stencil_name", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
 def test_optimised_and_full_communication_equivalence(stencil_name):
     target = ps.Target.CPU
     stencil = LBStencil(stencil_name)
-    domain_size = (4, ) * stencil.D
+    domain_size = (4,) * stencil.D

-    dh = ps.create_data_handling(domain_size, periodicity=(True, ) * stencil.D,
-                                 parallel=False, default_target=target)
+    dh = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )

     pdf = dh.add_array("pdf", values_per_cell=len(stencil), dtype=np.int64)
     dh.fill("pdf", 0, ghost_layers=True)
@@ -82,9 +195,9 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
     gl = dh.ghost_layers_of_field("pdf")

     num = 0
-    for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
-        dh.cpu_arrays['pdf'][idx] = num
-        dh.cpu_arrays['pdf_tmp'][idx] = num
+    for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
+        dh.cpu_arrays["pdf"][idx] = num
+        dh.cpu_arrays["pdf_tmp"][idx] = num
         num += 1

     lbm_config = LBMConfig(stencil=stencil, kernel_type="stream_pull_only")
@@ -95,21 +208,27 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
     ast = ps.create_kernel(ac, config=config)
     stream = ast.compile()

-    full_communication = dh.synchronization_function(pdf.name, target=dh.default_target, optimization={"openmp": True})
+    full_communication = dh.synchronization_function(
+        pdf.name, target=dh.default_target, optimization={"openmp": True}
+    )
     full_communication()
     dh.run_kernel(stream)
     dh.swap("pdf", "pdf_tmp")
-    pdf_full_communication = np.copy(dh.cpu_arrays['pdf'])
+    pdf_full_communication = np.copy(dh.cpu_arrays["pdf"])

     num = 0
-    for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
-        dh.cpu_arrays['pdf'][idx] = num
-        dh.cpu_arrays['pdf_tmp'][idx] = num
+    for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
+        dh.cpu_arrays["pdf"][idx] = num
+        dh.cpu_arrays["pdf_tmp"][idx] = num
         num += 1

-    optimised_communication = LBMPeriodicityHandling(stencil=stencil, data_handling=dh, pdf_field_name=pdf.name,
-                                                     streaming_pattern='pull')
+    optimised_communication = LBMPeriodicityHandling(
+        stencil=stencil,
+        data_handling=dh,
+        pdf_field_name=pdf.name,
+        streaming_pattern="pull",
+    )
     optimised_communication()
     dh.run_kernel(stream)
     dh.swap("pdf", "pdf_tmp")

@@ -119,9 +238,14 @@
             for j in range(gl, domain_size[1]):
                 for k in range(gl, domain_size[2]):
                     for f in range(len(stencil)):
-                        assert dh.cpu_arrays['pdf'][i, j, k, f] == pdf_full_communication[i, j, k, f], print(f)
+                        assert (
+                            dh.cpu_arrays["pdf"][i, j, k, f]
+                            == pdf_full_communication[i, j, k, f]
+                        ), print(f)
     else:
         for i in range(gl, domain_size[0]):
             for j in range(gl, domain_size[1]):
                 for f in range(len(stencil)):
-                    assert dh.cpu_arrays['pdf'][i, j, f] == pdf_full_communication[i, j, f]
+                    assert (
+                        dh.cpu_arrays["pdf"][i, j, f] == pdf_full_communication[i, j, f]
+                    )
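The class docstring above contrasts cupy's sliced device-to-device copies with the pre-compiled GPU copy kernels selected by `cupy_direct_copy=False`. The following is a minimal sketch of that kernel-based path, modelled on the tests added in this patch; it assumes cupy is installed, and the domain size and field name are illustrative only, not taken from the patch.

# Usage sketch (assumes cupy; "pdfs" and the 16^3 domain are illustrative)
import pystencils as ps
from lbmpy.stencils import LBStencil
from lbmpy.enums import Stencil
from lbmpy.advanced_streaming.communication import LBMPeriodicityHandling

stencil = LBStencil(Stencil.D3Q19)
dh = ps.create_data_handling((16,) * stencil.D,
                             periodicity=(True,) * stencil.D,
                             parallel=False,
                             default_target=ps.Target.GPU)
pdfs = dh.add_array("pdfs", values_per_cell=stencil.Q)
dh.fill(pdfs.name, 0.0, ghost_layers=True)
dh.all_to_gpu()

# cupy_direct_copy=False compiles one copy kernel per communication slice
# instead of using cupy's sliced device-to-device assignment.
periodicity = LBMPeriodicityHandling(stencil, dh, pdfs.name,
                                     streaming_pattern="pull",
                                     cupy_direct_copy=False)
periodicity()     # apply the periodic ghost-layer copies on the GPU
dh.all_to_cpu()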