diff --git a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp index 803455e449f5d8673de519883ebcf128fbdd0c41..bdf4b46e71ac66b15fbe211d1db74d3cc2a42ad5 100644 --- a/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp +++ b/lib/walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp @@ -1,6 +1,6 @@ #pragma once -// #if defined(WALBERLA_BUILD_WITH_CUDA) || defined(WALBERLA_BUILD_WITH_HIP) +#if defined(WALBERLA_BUILD_WITH_CUDA) || defined(WALBERLA_BUILD_WITH_HIP) #include "core/all.h" @@ -28,45 +28,63 @@ concept UniformGpuFieldPackInfoImpl = requires(T impl, { impl.doUnpack(field, buffer, dir, ci, stream) } -> std::same_as< void >; + { impl.doLocalCopy(field, ci, field, ci, dir, stream) } -> std::same_as< void >; + { impl.elementsPerCell(dir) } -> std::same_as< uint_t >; -} +}; } // namespace detail -template< detail::UniformGpuFieldPackInfoImpl Impl > +template< typename Impl > class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo { public: - UniformGpuFieldPackInfoBase(BlockDataID fieldId, uint_t sliceWidth = 0) + // static_assert( detail::UniformGpuFieldPackInfoImpl< Impl >, "Impl does not satisfy contraints."); + + UniformGpuFieldPackInfoBase(BlockDataID fieldId, uint_t sliceWidth = 1) : fieldId_{ fieldId }, sliceWidth_{ sliceWidth } {} - void pack(stencil::Direction dir, unsigned char* bufferPtr, IBlock* block, gpuStream_t stream) override + void pack(stencil::Direction dir, unsigned char* rawBuffer, IBlock* block, gpuStream_t stream) override { using Field_T = typename Impl::Field_T; using value_type = typename Field_T::value_type; - Field_T& field = *block->getData< Field_T >(fieldId_); + Field_T * field = block->getData< Field_T >(fieldId_); CellInterval ci; field->getSliceBeforeGhostLayer(dir, ci, sliceWidth_, false); - std::span< value_type > buffer{ static_cast< value_type* >(bufferPtr), this->size(dir, block) }; - impl().doPack(field, buffer, dir, stream); + std::span< value_type > buffer{ ( value_type* ) rawBuffer, this->size(dir, block) }; + impl().doPack(field, buffer, dir, ci, stream); } - void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, gpuStream_t stream) override + void unpack(stencil::Direction dir, unsigned char* rawBuffer, IBlock* block, gpuStream_t stream) override { using Field_T = typename Impl::Field_T; using value_type = typename Field_T::value_type; - Field_T& field = *block->getData< Field_T >(fieldId_); + Field_T * field = block->getData< Field_T >(fieldId_); CellInterval ci; field->getGhostRegion(dir, ci, sliceWidth_, false); - std::span< value_type > buffer{ static_cast< value_type* >(bufferPtr), this->size(dir, block) }; - impl().doUnpack(field, buffer, dir, stream); + std::span< value_type > buffer{ (value_type*) rawBuffer, this->size(dir, block) }; + impl().doUnpack(field, buffer, dir, ci, stream); + } + + void communicateLocal ( stencil::Direction dir, const IBlock *sender, IBlock *receiver, gpuStream_t stream ) override { + using Field_T = typename Impl::Field_T; + + Field_T * srcField = const_cast< IBlock * >(sender)->getData< Field_T >(fieldId_); + Field_T * dstField = receiver->getData< Field_T >(fieldId_); + + CellInterval srcRegion; + CellInterval dstRegion; + srcField->getSliceBeforeGhostLayer(dir, srcRegion, sliceWidth_, false); + dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, sliceWidth_, false); + + impl().doLocalCopy(srcField, srcRegion, dstField, dstRegion, dir, stream); } uint_t size(stencil::Direction dir, IBlock* block) override { using Field_T = typename Impl::Field_T; - auto field = block->getData< Field_T >(fieldId_); + const Field_T * field = block->getData< Field_T >(fieldId_); CellInterval ci; field->getGhostRegion(dir, ci, 1, false); @@ -84,4 +102,4 @@ class UniformGpuFieldPackInfoBase : public gpu::GeneratedGPUPackInfo } // namespace walberla::experimental::communication -// #endif +#endif diff --git a/src/walberla/codegen/api.py b/src/walberla/codegen/api.py index 766c6fc48653f74d6398c9c71235f82f2f2620ae..284b31766546141768791ca7d544ba29151d4238 100644 --- a/src/walberla/codegen/api.py +++ b/src/walberla/codegen/api.py @@ -264,7 +264,7 @@ class GhostLayerFieldPtr(GenericWalberlaField): ) @staticmethod - def create(field: Field): + def create(field: Field, const: bool = False): if field.index_dimensions > 1: raise ValueError( "Cannot map fields with more than one index dimension to field::GhostLayerField." @@ -279,15 +279,11 @@ class GhostLayerFieldPtr(GenericWalberlaField): fsize = field.index_shape[0] if field.index_shape else 1 - return GhostLayerFieldPtr(element_type, fsize).var(field.name) + return GhostLayerFieldPtr(element_type, fsize, const=const).var(field.name) - def __init__( - self, - element_type: UserTypeSpec, - fsize: int, - ): + def __init__(self, element_type: UserTypeSpec, fsize: int, const: bool = False): element_type = create_type(element_type) - field_type = self._template(element_type=element_type, fsize=fsize) + field_type = self._template(element_type=element_type, fsize=fsize, const=const) super().__init__(element_type, field_type, ptr=True) @@ -299,7 +295,7 @@ class GpuFieldPtr(GenericWalberlaField): ) @staticmethod - def create(field: Field): + def create(field: Field, const: bool = False): if field.index_dimensions > 1: raise ValueError( "Cannot map fields with more than one index dimension to gpu::GpuField." @@ -314,15 +310,16 @@ class GpuFieldPtr(GenericWalberlaField): fsize = field.index_shape[0] if field.index_shape else 1 - return GpuFieldPtr(element_type, fsize).var(field.name) + return GpuFieldPtr(element_type, fsize, const=const).var(field.name) def __init__( self, element_type: UserTypeSpec, fsize: int, + const: bool = False, ): element_type = create_type(element_type) - field_type = self._template(element_type=element_type) + field_type = self._template(element_type=element_type, const=const) super().__init__(element_type, field_type, ptr=True) diff --git a/src/walberla/codegen/communication/pack_infos.py b/src/walberla/codegen/communication/pack_infos.py index 9989c9ca025fe52c9d78b741e30aac045cc310ca..229bc9dc8cbf5d08a74bc20718140e1bbe25d3b6 100644 --- a/src/walberla/codegen/communication/pack_infos.py +++ b/src/walberla/codegen/communication/pack_infos.py @@ -11,16 +11,20 @@ from pystencils import ( Target, ) from pystencils.stencil import offset_to_direction_string +from pystencils.codegen.properties import FieldBasePtr from lbmpy import LBStencil from pystencilssfg import SfgComposer from pystencilssfg.composer.basic_composer import KernelsAdder from pystencilssfg.composer.custom import CustomGenerator -from pystencilssfg.ir import SfgKernelHandle +from pystencilssfg.ir import SfgKernelHandle, SfgEmptyNode +from pystencilssfg.ir.call_tree import SfgCallTreeNode +from pystencilssfg.ir.postprocessing import PostProcessingContext, SfgDeferredNode +from pystencilssfg.lang import SfgKernelParamVar, AugExpr, strip_ptr_ref from pystencilssfg.lang.cpp import std from pystencilssfg.lang.gpu import CudaAPI, HipAPI -from ..api import GpuFieldPtr, Direction, CellInterval +from ..api import GpuFieldPtr, Direction, CellInterval, uint_t from ..build_config import get_build_config @@ -31,6 +35,26 @@ class PackingKernelsContext: cfg: CreateKernelConfig +@dataclass +class CaptureBufferPointer(SfgDeferredNode): + sfg: SfgComposer + buffer_name: str + buffer_span: std.span + + def expand(self, ppc: PostProcessingContext) -> SfgCallTreeNode: + for param in ppc.live_variables: + if ( + isinstance(param, SfgKernelParamVar) + and param.wrapped.fields[0].name == self.buffer_name + and param.wrapped.get_properties(FieldBasePtr) + ): + return self.sfg.init(param)( + AugExpr.format("{}.data()", self.buffer_span) + ) + + return SfgEmptyNode() + + class GpuFieldPackInfo(CustomGenerator): def __init__(self, name: str, stencil: LBStencil, field: Field): if field.index_dimensions > 1: @@ -47,9 +71,18 @@ class GpuFieldPackInfo(CustomGenerator): self._stencil = stencil self._field = field self._dtype = field.dtype + self._src_field = Field.new_field_with_different_name( + self._field, f"{self._field.name}_src" + ) + self._dst_field = Field.new_field_with_different_name( + self._field, f"{self._field.name}_dst" + ) def generate(self, sfg: SfgComposer) -> None: - base_class = f"walberla::experimental::communication::UniformGpuFieldPackInfoImpl< {self._name } >" + base_class = f"walberla::experimental::communication::UniformGpuFieldPackInfoBase< {self._name } >" + sfg.include( + "walberla/experimental/communication/UniformGpuFieldPackInfoBase.hpp" + ) build_config = get_build_config(sfg) @@ -72,51 +105,105 @@ class GpuFieldPackInfo(CustomGenerator): pack_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict() unpack_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict() + local_copy_kernels: dict[tuple[int, int, int], SfgKernelHandle] = dict() + + comm_dirs = [c for c in self._stencil if not all(x == 0 for x in c)] - for comm_dir in self._stencil: - pack_kernels[comm_dir] = self._do_pack(pkc, comm_dir) - unpack_kernels[comm_dir] = self._do_unpack(pkc, comm_dir) + for comm_dir in comm_dirs: + if not all(c == 0 for c in comm_dir): + pack_kernels[comm_dir] = self._do_pack(pkc, comm_dir) + unpack_kernels[comm_dir] = self._do_unpack(pkc, comm_dir) + local_copy_kernels[comm_dir] = self._do_local_copy(pkc, comm_dir) - gpu_field = GpuFieldPtr.create(self._field) + src_gpu_field = GpuFieldPtr.create(self._src_field) + gpu_field_type = strip_ptr_ref(src_gpu_field.get_dtype()) + dst_gpu_field = GpuFieldPtr.create(self._dst_field) buffer_span = std.span(self._dtype).var("buffer") dir = Direction().var("dir") - ci = CellInterval().var("ci") - stream = GpuAPI.stream_t().var("stream") + src_interval = CellInterval(const=True, ref=True).var("srcInterval") + dst_interval = CellInterval(const=True, ref=True).var("dstInterval") - common_buffer = self._buffer(1) + stream = GpuAPI.stream_t().var("stream") sfg.klass(self._name, bases=[f"public {base_class}"])( sfg.public( - f"using Field_T = {gpu_field.get_dtype().c_string()};", - sfg.method("doPack").params(gpu_field, buffer_span, dir, ci, stream)( - sfg.map_field(self._field, gpu_field), - sfg.map_field(common_buffer, buffer_span), - sfg.switch(dir).cases( + f"using Base = {base_class};", + "using Base::Base;", + f"using Field_T = {gpu_field_type.c_string()};", + sfg.method("doPack").params( + src_gpu_field, buffer_span, dir, src_interval, stream + )( + sfg.map_field( + self._src_field, src_gpu_field.with_cell_interval(src_interval) + ), + CaptureBufferPointer(sfg, "buffer", buffer_span), + sfg.switch(dir) + .cases( { f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke( pack_kernels[comm_dir], stream=stream ) - for comm_dir in self._stencil + for comm_dir in comm_dirs } - ), + ) + .default("/* unreachable */"), ), - sfg.method("doUnpack").params(gpu_field, buffer_span, dir, ci, stream)( - sfg.map_field(self._field, gpu_field), - sfg.map_field(common_buffer, buffer_span), - sfg.switch(dir).cases( + sfg.method("doUnpack").params( + src_gpu_field, buffer_span, dir, dst_interval, stream + )( + sfg.map_field( + self._dst_field, src_gpu_field.with_cell_interval(dst_interval) + ), + CaptureBufferPointer(sfg, "buffer", buffer_span), + sfg.switch(dir) + .cases( { f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke( unpack_kernels[comm_dir], stream=stream ) - for comm_dir in self._stencil + for comm_dir in comm_dirs } + ) + .default("/* unreachable */"), + ), + sfg.method("doLocalCopy").params( + src_gpu_field, + src_interval, + dst_gpu_field, + dst_interval, + dir, + stream, + )( + sfg.map_field( + self._src_field, src_gpu_field.with_cell_interval(src_interval) + ), + sfg.map_field( + self._dst_field, dst_gpu_field.with_cell_interval(dst_interval) ), + sfg.switch(dir) + .cases( + { + f"walberla::stencil::Direction::{offset_to_direction_string(comm_dir)}": sfg.gpu_invoke( + local_copy_kernels[comm_dir], stream=stream + ) + for comm_dir in comm_dirs + } + ) + .default("/* unreachable */"), ), + sfg.method("elementsPerCell") + .inline() + .const() + .params(dir) + .returns(uint_t)(f"return {self._field.index_shape[0]};"), ) ) - def _pack_accesses(self, comm_dir: tuple[int, int, int]): - return [self._field.center(i) for i in range(self._field.index_shape[0])] + def _pack_accesses(self, comm_dir: tuple[int, int, int]) -> list[Field.Access]: + return [self._src_field.center(i) for i in range(self._field.index_shape[0])] + + def _unpack_accesses(self, comm_dir: tuple[int, int, int]) -> list[Field.Access]: + return [self._dst_field.center(i) for i in range(self._field.index_shape[0])] def _do_pack( self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int] @@ -130,12 +217,22 @@ class GpuFieldPackInfo(CustomGenerator): def _do_unpack( self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int] ) -> SfgKernelHandle: - pack_accs = self._pack_accesses(comm_dir) - buffer = self._buffer(len(pack_accs)) - asms = [Assignment(acc, buffer(i)) for i, acc in enumerate(pack_accs)] + unpack_accs = self._unpack_accesses(comm_dir) + buffer = self._buffer(len(unpack_accs)) + asms = [Assignment(acc, buffer(i)) for i, acc in enumerate(unpack_accs)] dir_str = offset_to_direction_string(comm_dir) return pkc.kns.create(asms, f"unpack{dir_str}", pkc.cfg) + def _do_local_copy( + self, pkc: PackingKernelsContext, comm_dir: tuple[int, int, int] + ) -> SfgKernelHandle: + pack_accs = self._pack_accesses(comm_dir) + unpack_accs = self._unpack_accesses(comm_dir) + + asms = [Assignment(dst, src) for dst, src in zip(unpack_accs, pack_accs)] + dir_str = offset_to_direction_string(comm_dir) + return pkc.kns.create(asms, f"localCopy{dir_str}", pkc.cfg) + def _buffer(self, num_elems: int): return Field.create_generic( "buffer", diff --git a/tests/BasicLbmScenarios/LbmAlgorithms.py b/tests/BasicLbmScenarios/LbmAlgorithms.py index eefe063997ded8cbdb183b1244336479f8514e8e..9658792ce11f159b680eb62e5514d03f53a1a655 100644 --- a/tests/BasicLbmScenarios/LbmAlgorithms.py +++ b/tests/BasicLbmScenarios/LbmAlgorithms.py @@ -16,6 +16,7 @@ from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from walberla.codegen import Sweep, get_build_config from walberla.codegen.boundaries import NoSlip, FreeSlip +from walberla.codegen.communication import GpuFieldPackInfo from walberla.codegen.build_config import DEBUG_MOCK_CMAKE @@ -31,17 +32,19 @@ with SourceFileGenerator(keep_unknown_argv=True) as sfg: match args.target: case "cpu": - build_config.override.target = Target.CurrentCPU + target = Target.CurrentCPU sfg.code("#define LBM_SCENARIOS_CPU_BUILD true") case "hip": - build_config.override.target = Target.HIP + target = Target.HIP sfg.code("#define LBM_SCENARIOS_GPU_BUILD true") case "cuda": - build_config.override.target = Target.CUDA + target = Target.CUDA sfg.code("#define LBM_SCENARIOS_GPU_BUILD true") case _: raise ValueError(f"Unexpected target id: {args.target}") + build_config.override.target = target + sfg.namespace("BasicLbmScenarios::gen") stencil = LBStencil(Stencil.D3Q19) @@ -117,3 +120,8 @@ with SourceFileGenerator(keep_unknown_argv=True) as sfg: wall_orientation=FreeSlip.IRREGULAR, ) sfg.generate(irreg_freeslip) + + if build_config.override.target.is_gpu(): + with sfg.namespace("comm"): + pack_info = GpuFieldPackInfo("GpuPdfsPackInfo", stencil, f) + sfg.generate(pack_info) diff --git a/tests/BasicLbmScenarios/SimDomain.hpp b/tests/BasicLbmScenarios/SimDomain.hpp index 4417699a8d2eec630f684ce46755fee0fecd45dd..508adb9d38aa80e6ae1e71be9f6bab82b20fc56c 100644 --- a/tests/BasicLbmScenarios/SimDomain.hpp +++ b/tests/BasicLbmScenarios/SimDomain.hpp @@ -45,7 +45,7 @@ using CommonGpuField = gpu::GPUField< PdfField_T::value_type >; using GpuCommScheme = gpu::communication::UniformGPUScheme< gen::LbStencil >; // using GpuPdfsPackInfo = gpu::communication::MemcpyPackInfo< CommonGpuField >; -using GpuPdfsPackInfo = gpu::communication::GPUPackInfo< CommonGpuField >; +using GpuPdfsPackInfo = gen::comm::GpuPdfsPackInfo; #endif struct SimDomain @@ -70,7 +70,7 @@ struct SimDomain const BlockDataID uId; } gpuFields; - // GpuCommScheme commGpu; + GpuCommScheme commGpu; void initFromFields(const Vector3< real_t > force) { @@ -121,7 +121,7 @@ struct SimDomain void syncGhostLayers() { // WALBERLA_GPU_CHECK(gpuPeekAtLastError()); - commCpu(); + commGpu(); } void fields2host() @@ -242,11 +242,10 @@ struct SimDomainBuilder const BlockDataID rhoIdGpu = gpu::addGPUFieldToStorage< ScalarField_T >(sbfs, rhoId, "rho_gpu"); const BlockDataID uIdGpu = gpu::addGPUFieldToStorage< VectorField_T >(sbfs, uId, "u_gpu"); - // GpuCommScheme commGpu{ sbfs }; - // auto gpuPdfsPackInfo = std::make_shared< GpuPdfsPackInfo >(pdfsIdGpu); - // commGpu.addPackInfo(gpuPdfsPackInfo); + GpuCommScheme commGpu{ sbfs }; auto gpuPdfsPackInfo = std::make_shared< GpuPdfsPackInfo >(pdfsIdGpu); - commCpu.addPackInfo(gpuPdfsPackInfo); + commGpu.addPackInfo(gpuPdfsPackInfo); + // commCpu.addPackInfo(gpuPdfsPackInfo); #endif return @@ -265,7 +264,7 @@ struct SimDomainBuilder .rhoId = rhoIdGpu, .uId = uIdGpu }, - // .commGpu = commGpu + .commGpu = commGpu #endif }; }