Skip to content
Snippets Groups Projects
Commit 019c73c6 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

remove ancient pystencils.gpucuda import. Write test case for cupy-copy/direct-copy equivalence.

parent eb626105
No related branches found
No related tags found
1 merge request!182Fix GPU copy kernels in periodicity handling
import itertools import itertools
from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice from pystencils.slicing import (
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \ shift_slice,
Timestep, get_timesteps, numeric_offsets get_slice_before_ghost_layer,
normalize_slice,
)
from lbmpy.advanced_streaming.utility import (
is_inplace,
get_accessor,
numeric_index,
Timestep,
get_timesteps,
numeric_offsets,
)
from pystencils.datahandling import SerialDataHandling from pystencils.datahandling import SerialDataHandling
from pystencils.enums import Target from pystencils.enums import Target
from itertools import chain from itertools import chain
...@@ -10,22 +20,28 @@ from itertools import chain ...@@ -10,22 +20,28 @@ from itertools import chain
class LBMPeriodicityHandling: class LBMPeriodicityHandling:
def __init__(self, stencil, data_handling, pdf_field_name, def __init__(
streaming_pattern='pull', ghost_layers=1, self,
cupy_direct_copy=True): stencil,
data_handling,
pdf_field_name,
streaming_pattern="pull",
ghost_layers=1,
cupy_direct_copy=True,
):
""" """
Periodicity Handling for Lattice Boltzmann Streaming. Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda:** **On the usage with cuda:**
- cupy allows the copying of sliced arrays within device memory using the numpy syntax, - cupy allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as cupy array copying, compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds. but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case. Choose your weapon depending on your use case.
""" """
if not isinstance(data_handling, SerialDataHandling): if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only serial data handling is supported!') raise ValueError("Only serial data handling is supported!")
self.stencil = stencil self.stencil = stencil
self.dim = stencil.D self.dim = stencil.D
...@@ -56,12 +72,16 @@ class LBMPeriodicityHandling: ...@@ -56,12 +72,16 @@ class LBMPeriodicityHandling:
self.comm_slices = [] self.comm_slices = []
timesteps = get_timesteps(streaming_pattern) timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps: for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil, slices_per_comm_dir = get_communication_slices(
comm_stencil=copy_directions, stencil=stencil,
streaming_pattern=streaming_pattern, comm_stencil=copy_directions,
prev_timestep=timestep, streaming_pattern=streaming_pattern,
ghost_layers=ghost_layers) prev_timestep=timestep,
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))) ghost_layers=ghost_layers,
)
self.comm_slices.append(
list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
)
if self.target == Target.GPU and not cupy_direct_copy: if self.target == Target.GPU and not cupy_direct_copy:
self.device_copy_kernels = list() self.device_copy_kernels = list()
...@@ -81,11 +101,11 @@ class LBMPeriodicityHandling: ...@@ -81,11 +101,11 @@ class LBMPeriodicityHandling:
arr[dst] = arr[src] arr[dst] = arr[src]
def _compile_copy_kernels(self, timestep): def _compile_copy_kernels(self, timestep):
assert self.target == Target.GPU
pdf_field = self.dh.fields[self.pdf_field_name] pdf_field = self.dh.fields[self.pdf_field_name]
kernels = [] kernels = []
for src, dst in self.comm_slices[timestep.idx]: for src, dst in self.comm_slices[timestep.idx]:
kernels.append( kernels.append(periodic_pdf_gpu_copy_kernel(pdf_field, src, dst))
periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
return kernels return kernels
def _periodicity_handling_gpu(self, prev_timestep): def _periodicity_handling_gpu(self, prev_timestep):
...@@ -100,7 +120,12 @@ class LBMPeriodicityHandling: ...@@ -100,7 +120,12 @@ class LBMPeriodicityHandling:
def get_communication_slices( def get_communication_slices(
stencil, comm_stencil=None, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1): stencil,
comm_stencil=None,
streaming_pattern="pull",
prev_timestep=Timestep.BOTH,
ghost_layers=1,
):
""" """
Return the source and destination slices for periodicity handling or communication between blocks. Return the source and destination slices for periodicity handling or communication between blocks.
...@@ -116,7 +141,9 @@ def get_communication_slices( ...@@ -116,7 +141,9 @@ def get_communication_slices(
if comm_stencil is None: if comm_stencil is None:
comm_stencil = itertools.product(*([-1, 0, 1] for _ in range(stencil.D))) comm_stencil = itertools.product(*([-1, 0, 1] for _ in range(stencil.D)))
pdfs = Field.create_generic('pdfs', spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,)) pdfs = Field.create_generic(
"pdfs", spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,)
)
write_accesses = get_accessor(streaming_pattern, prev_timestep).write(pdfs, stencil) write_accesses = get_accessor(streaming_pattern, prev_timestep).write(pdfs, stencil)
slices_per_comm_direction = dict() slices_per_comm_direction = dict()
...@@ -130,7 +157,9 @@ def get_communication_slices( ...@@ -130,7 +157,9 @@ def get_communication_slices(
d = stencil.index(streaming_dir) d = stencil.index(streaming_dir)
write_index = numeric_index(write_accesses[d])[0] write_index = numeric_index(write_accesses[d])[0]
origin_slice = get_slice_before_ghost_layer(comm_dir, ghost_layers=ghost_layers, thickness=1) origin_slice = get_slice_before_ghost_layer(
comm_dir, ghost_layers=ghost_layers, thickness=1
)
src_slice = _fix_length_one_slices(origin_slice) src_slice = _fix_length_one_slices(origin_slice)
write_offsets = numeric_offsets(write_accesses[d]) write_offsets = numeric_offsets(write_accesses[d])
...@@ -138,13 +167,15 @@ def get_communication_slices( ...@@ -138,13 +167,15 @@ def get_communication_slices(
# TODO: this is just a hotfix. _trim_slice_in_direction breaks FreeSlip BC with adjacent periodic side # TODO: this is just a hotfix. _trim_slice_in_direction breaks FreeSlip BC with adjacent periodic side
if streaming_pattern != "pull": if streaming_pattern != "pull":
src_slice = shift_slice(_trim_slice_in_direction(src_slice, tangential_dir), write_offsets) src_slice = shift_slice(
_trim_slice_in_direction(src_slice, tangential_dir), write_offsets
)
neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers) neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers)
dst_slice = shift_slice(src_slice, neighbour_transform) dst_slice = shift_slice(src_slice, neighbour_transform)
src_slice = src_slice + (write_index, ) src_slice = src_slice + (write_index,)
dst_slice = dst_slice + (write_index, ) dst_slice = dst_slice + (write_index,)
slices_for_dir.append((src_slice, dst_slice)) slices_for_dir.append((src_slice, dst_slice))
...@@ -152,10 +183,11 @@ def get_communication_slices( ...@@ -152,10 +183,11 @@ def get_communication_slices(
return slices_per_comm_direction return slices_per_comm_direction
def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice, def periodic_pdf_gpu_copy_kernel(pdf_field, src_slice, dst_slice, domain_size=None):
domain_size=None, target=Target.GPU): """Generate a GPU kernel which copies all values from one slice of a field
"""Copies a rectangular array slice onto another non-overlapping array slice""" to another non-overlapping slice."""
from pystencils.gpucuda.kernelcreation import create_cuda_kernel # from pystencils.gpu.kernelcreation import create_cuda_kernel
from pystencils import create_kernel
pdf_idx = src_slice[-1] pdf_idx = src_slice[-1]
assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant" assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
...@@ -174,20 +206,28 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice, ...@@ -174,20 +206,28 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
return s.start if isinstance(s, slice) else s return s.start if isinstance(s, slice) else s
def _stop(s): def _stop(s):
return s.stop if isinstance(s, slice) else s return s.stop if isinstance(s, slice) else s + 1
offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)] offset = [
assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \ _start(s1) - _start(s2)
"Slices have to have same size" for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
]
copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))]) assert offset == [
config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True) _stop(s1) - _stop(s2)
ast = create_cuda_kernel(copy_eq, config=config) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
if target == Target.GPU: ], "Slices have to have same size"
from pystencils.gpucuda import make_python_function
return make_python_function(ast) copy_eq = AssignmentCollection(
else: main_assignments=[
raise ValueError('Invalid target:', target) Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
]
)
config = CreateKernelConfig(
iteration_slice=dst_slice, skip_independence_check=True, target=Target.GPU
)
ast = create_kernel(copy_eq, config=config)
return ast.compile()
def _extend_dir(direction): def _extend_dir(direction):
...@@ -196,10 +236,10 @@ def _extend_dir(direction): ...@@ -196,10 +236,10 @@ def _extend_dir(direction):
elif direction[0] == 0: elif direction[0] == 0:
for d in [-1, 0, 1]: for d in [-1, 0, 1]:
for rest in _extend_dir(direction[1:]): for rest in _extend_dir(direction[1:]):
yield (d, ) + rest yield (d,) + rest
else: else:
for rest in _extend_dir(direction[1:]): for rest in _extend_dir(direction[1:]):
yield (direction[0], ) + rest yield (direction[0],) + rest
def _get_neighbour_transform(direction, ghost_layers): def _get_neighbour_transform(direction, ghost_layers):
......
...@@ -5,36 +5,51 @@ import numpy as np ...@@ -5,36 +5,51 @@ import numpy as np
from lbmpy.stencils import LBStencil from lbmpy.stencils import LBStencil
from pystencils.slicing import get_slice_before_ghost_layer, get_ghost_region_slice from pystencils.slicing import get_slice_before_ghost_layer, get_ghost_region_slice
from lbmpy.creationfunctions import create_lb_update_rule, LBMConfig, LBMOptimisation from lbmpy.creationfunctions import create_lb_update_rule, LBMConfig, LBMOptimisation
from lbmpy.advanced_streaming.communication import get_communication_slices, _fix_length_one_slices, \ from lbmpy.advanced_streaming.communication import (
LBMPeriodicityHandling get_communication_slices,
_fix_length_one_slices,
LBMPeriodicityHandling,
)
from lbmpy.advanced_streaming.utility import streaming_patterns, Timestep from lbmpy.advanced_streaming.utility import streaming_patterns, Timestep
from lbmpy.enums import Stencil from lbmpy.enums import Stencil
import pytest import pytest
@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]) @pytest.mark.parametrize(
@pytest.mark.parametrize('streaming_pattern', streaming_patterns) "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD]) )
@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
def test_slices_not_empty(stencil, streaming_pattern, timestep): def test_slices_not_empty(stencil, streaming_pattern, timestep):
stencil = LBStencil(stencil) stencil = LBStencil(stencil)
arr = np.zeros((4,) * stencil.D + (stencil.Q,)) arr = np.zeros((4,) * stencil.D + (stencil.Q,))
slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep, slices = get_communication_slices(
ghost_layers=1) stencil,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=1,
)
for _, slices_list in slices.items(): for _, slices_list in slices.items():
for src, dst in slices_list: for src, dst in slices_list:
assert all(s != 0 for s in arr[src].shape) assert all(s != 0 for s in arr[src].shape)
assert all(s != 0 for s in arr[dst].shape) assert all(s != 0 for s in arr[dst].shape)
@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]) @pytest.mark.parametrize(
@pytest.mark.parametrize('streaming_pattern', streaming_patterns) "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD]) )
@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
def test_src_dst_same_shape(stencil, streaming_pattern, timestep): def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
stencil = LBStencil(stencil) stencil = LBStencil(stencil)
arr = np.zeros((4,) * stencil.D + (stencil.Q,)) arr = np.zeros((4,) * stencil.D + (stencil.Q,))
slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep, slices = get_communication_slices(
ghost_layers=1) stencil,
streaming_pattern=streaming_pattern,
prev_timestep=timestep,
ghost_layers=1,
)
for _, slices_list in slices.items(): for _, slices_list in slices.items():
for src, dst in slices_list: for src, dst in slices_list:
src_shape = arr[src].shape src_shape = arr[src].shape
...@@ -42,12 +57,15 @@ def test_src_dst_same_shape(stencil, streaming_pattern, timestep): ...@@ -42,12 +57,15 @@ def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
assert src_shape == dst_shape assert src_shape == dst_shape
@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]) @pytest.mark.parametrize(
"stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
)
def test_pull_communication_slices(stencil): def test_pull_communication_slices(stencil):
stencil = LBStencil(stencil) stencil = LBStencil(stencil)
slices = get_communication_slices( slices = get_communication_slices(
stencil, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1) stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
)
for i, d in enumerate(stencil): for i, d in enumerate(stencil):
if i == 0: if i == 0:
continue continue
...@@ -58,21 +76,71 @@ def test_pull_communication_slices(stencil): ...@@ -58,21 +76,71 @@ def test_pull_communication_slices(stencil):
dst = s[1][:-1] dst = s[1][:-1]
break break
inner_slice = _fix_length_one_slices(get_slice_before_ghost_layer(d, ghost_layers=1)) inner_slice = _fix_length_one_slices(
get_slice_before_ghost_layer(d, ghost_layers=1)
)
inv_dir = (-e for e in d) inv_dir = (-e for e in d)
gl_slice = _fix_length_one_slices(get_ghost_region_slice(inv_dir, ghost_layers=1)) gl_slice = _fix_length_one_slices(
get_ghost_region_slice(inv_dir, ghost_layers=1)
)
assert src == inner_slice assert src == inner_slice
assert dst == gl_slice assert dst == gl_slice
@pytest.mark.parametrize('stencil_name', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]) @pytest.mark.parametrize("stencil", [Stencil.D2Q9, Stencil.D3Q19])
@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
def test_direct_copy_and_kernels_equivalence(stencil: Stencil, streaming_pattern: str):
pytest.importorskip("cupy")
target = ps.Target.GPU
stencil = LBStencil(stencil)
domain_size = (4,) * stencil.D
dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
domain_size,
periodicity=(True,) * stencil.D,
parallel=False,
default_target=target,
)
pdfs_a = dh.add_array("pdfs_a", values_per_cell=stencil.Q)
pdfs_b = dh.add_array("pdfs_b", values_per_cell=stencil.Q)
for q in range(stencil.Q):
sl = ps.make_slice[:4, :4, q] if stencil.D == 2 else ps.make_slice[:4, :4, :4, q]
dh.cpu_arrays[pdfs_a.name][sl] = q
dh.cpu_arrays[pdfs_b.name][sl] = q
dh.all_to_gpu()
direct_copy = LBMPeriodicityHandling(stencil, dh, pdfs_a.name, streaming_pattern, cupy_direct_copy=True)
kernels_copy = LBMPeriodicityHandling(stencil, dh, pdfs_b.name, streaming_pattern, cupy_direct_copy=False)
direct_copy(Timestep.EVEN)
kernels_copy(Timestep.EVEN)
dh.all_to_cpu()
np.testing.assert_equal(
dh.cpu_arrays[pdfs_a.name],
dh.cpu_arrays[pdfs_b.name]
)
@pytest.mark.parametrize(
"stencil_name", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
)
def test_optimised_and_full_communication_equivalence(stencil_name): def test_optimised_and_full_communication_equivalence(stencil_name):
target = ps.Target.CPU target = ps.Target.CPU
stencil = LBStencil(stencil_name) stencil = LBStencil(stencil_name)
domain_size = (4, ) * stencil.D domain_size = (4,) * stencil.D
dh = ps.create_data_handling(domain_size, periodicity=(True, ) * stencil.D, dh = ps.create_data_handling(
parallel=False, default_target=target) domain_size,
periodicity=(True,) * stencil.D,
parallel=False,
default_target=target,
)
pdf = dh.add_array("pdf", values_per_cell=len(stencil), dtype=np.int64) pdf = dh.add_array("pdf", values_per_cell=len(stencil), dtype=np.int64)
dh.fill("pdf", 0, ghost_layers=True) dh.fill("pdf", 0, ghost_layers=True)
...@@ -82,9 +150,9 @@ def test_optimised_and_full_communication_equivalence(stencil_name): ...@@ -82,9 +150,9 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
gl = dh.ghost_layers_of_field("pdf") gl = dh.ghost_layers_of_field("pdf")
num = 0 num = 0
for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']): for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
dh.cpu_arrays['pdf'][idx] = num dh.cpu_arrays["pdf"][idx] = num
dh.cpu_arrays['pdf_tmp'][idx] = num dh.cpu_arrays["pdf_tmp"][idx] = num
num += 1 num += 1
lbm_config = LBMConfig(stencil=stencil, kernel_type="stream_pull_only") lbm_config = LBMConfig(stencil=stencil, kernel_type="stream_pull_only")
...@@ -95,21 +163,27 @@ def test_optimised_and_full_communication_equivalence(stencil_name): ...@@ -95,21 +163,27 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
ast = ps.create_kernel(ac, config=config) ast = ps.create_kernel(ac, config=config)
stream = ast.compile() stream = ast.compile()
full_communication = dh.synchronization_function(pdf.name, target=dh.default_target, optimization={"openmp": True}) full_communication = dh.synchronization_function(
pdf.name, target=dh.default_target, optimization={"openmp": True}
)
full_communication() full_communication()
dh.run_kernel(stream) dh.run_kernel(stream)
dh.swap("pdf", "pdf_tmp") dh.swap("pdf", "pdf_tmp")
pdf_full_communication = np.copy(dh.cpu_arrays['pdf']) pdf_full_communication = np.copy(dh.cpu_arrays["pdf"])
num = 0 num = 0
for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']): for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
dh.cpu_arrays['pdf'][idx] = num dh.cpu_arrays["pdf"][idx] = num
dh.cpu_arrays['pdf_tmp'][idx] = num dh.cpu_arrays["pdf_tmp"][idx] = num
num += 1 num += 1
optimised_communication = LBMPeriodicityHandling(stencil=stencil, data_handling=dh, pdf_field_name=pdf.name, optimised_communication = LBMPeriodicityHandling(
streaming_pattern='pull') stencil=stencil,
data_handling=dh,
pdf_field_name=pdf.name,
streaming_pattern="pull",
)
optimised_communication() optimised_communication()
dh.run_kernel(stream) dh.run_kernel(stream)
dh.swap("pdf", "pdf_tmp") dh.swap("pdf", "pdf_tmp")
...@@ -119,9 +193,14 @@ def test_optimised_and_full_communication_equivalence(stencil_name): ...@@ -119,9 +193,14 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
for j in range(gl, domain_size[1]): for j in range(gl, domain_size[1]):
for k in range(gl, domain_size[2]): for k in range(gl, domain_size[2]):
for f in range(len(stencil)): for f in range(len(stencil)):
assert dh.cpu_arrays['pdf'][i, j, k, f] == pdf_full_communication[i, j, k, f], print(f) assert (
dh.cpu_arrays["pdf"][i, j, k, f]
== pdf_full_communication[i, j, k, f]
), print(f)
else: else:
for i in range(gl, domain_size[0]): for i in range(gl, domain_size[0]):
for j in range(gl, domain_size[1]): for j in range(gl, domain_size[1]):
for f in range(len(stencil)): for f in range(len(stencil)):
assert dh.cpu_arrays['pdf'][i, j, f] == pdf_full_communication[i, j, f] assert (
dh.cpu_arrays["pdf"][i, j, f] == pdf_full_communication[i, j, f]
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment