diff --git a/src/pystencils_autodiff/graph_datahandling.py b/src/pystencils_autodiff/graph_datahandling.py index f33b89f5797c525e4a6539bd684a453aac2d9c90..0e68177b70a83775270a84ca2743b67c5b1c89ff 100644 --- a/src/pystencils_autodiff/graph_datahandling.py +++ b/src/pystencils_autodiff/graph_datahandling.py @@ -7,6 +7,7 @@ """ """ +from copy import copy from enum import Enum import numpy as np @@ -37,6 +38,17 @@ class DataTransferKind(str, Enum): return self in [self.HOST_TO_DEVICE, self.DEVICE_TO_HOST, self.SWAP] +# class BoundaryHandling: + # def __init__(self, field: pystencils.Field): + # self.field = field + + # def __str__(self): + # return f'BoundaryHandling on {self.field}' + + # def __repr__(self): + # return self.__str__() + + class DataTransfer: def __init__(self, field: pystencils.Field, kind: DataTransferKind): self.field = field @@ -67,9 +79,10 @@ class Communication(DataTransfer): class KernelCall: - def __init__(self, kernel: pystencils.kernel_wrapper.KernelWrapper, kwargs): + def __init__(self, kernel: pystencils.kernel_wrapper.KernelWrapper, kwargs, tmp_field_swaps=[]): self.kernel = kernel self.kwargs = kwargs + self.tmp_field_swaps = tmp_field_swaps def __str__(self): return "Call " + str(self.kernel.ast.function_name) @@ -91,7 +104,7 @@ class TimeloopRun: @property def asts(self): - return self.timeloop._single_step_asts + return [s.kernel.ast for s in self.timeloop._single_step_asts if hasattr(s, 'kernel')] class GraphDataHandling(pystencils.datahandling.SerialDataHandling): @@ -113,17 +126,22 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling): self._single_step_functions.append(f) def add_call(self, functor, argument_list): - for argument_dict in argument_list: - self._single_step_asts.append((functor, argument_dict) if not hasattr(functor, 'ast') else functor.ast) - - if hasattr(functor, 'kernel'): - functor = functor.kernel - if not isinstance(argument_list, list): - argument_list = [argument_list] + if hasattr(functor, 'ast'): + self._single_step_asts.append(KernelCall(functor, {})) + else: + former_queue = self.parent.call_queue + self.parent.call_queue = [] + # + functor(*argument_list) + self._single_step_asts.extend(self.parent.call_queue) + self.parent.call_queue = former_queue def run(self, time_steps=1): - self.parent.call_queue.append(TimeloopRun(self, time_steps)) + former_call_queue = copy(self.parent.call_queue) + self.parent.call_queue = [] super().run(time_steps) + self.parent.call_queue = former_call_queue + former_call_queue.append(TimeloopRun(self, time_steps)) def swap(self, src, dst, is_gpu): if isinstance(src, str): @@ -135,10 +153,20 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling): def __init__(self, *args, **kwargs): self.call_queue = [] + self._timeloop_record = None super().__init__(*args, **kwargs) - def add_array(self, name, values_per_cell=1, dtype=np.float64, latex_name=None, ghost_layers=None, layout=None, - cpu=True, gpu=None, alignment=False, field_type=FieldType.GENERIC): + def add_array(self, + name, + values_per_cell=1, + dtype=np.float64, + latex_name=None, + ghost_layers=None, + layout=None, + cpu=True, + gpu=None, + alignment=False, + field_type=FieldType.GENERIC): super().add_array(name, values_per_cell, @@ -189,10 +217,17 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling): self.call_queue.append(DataTransfer(self._fields[name], DataTransferKind.HOST_TO_DEVICE)) def synchronization_function(self, names, stencil=None, target=None, **_): - for name in names: - gpu = target == 'gpu' - self.call_queue.append(Communication(self._fields[name], stencil, gpu)) - super().synchronization_function(names, stencil=None, target=None, **_) + def func(): + for name in names: + gpu = target == 'gpu' + self.call_queue.append(Communication(self._fields[name], stencil, gpu)) + pystencils.datahandling.SerialDataHandling.synchronization_function(self, + names, + stencil=None, + target=None, + **_) + + return func def __str__(self): return '\n'.join(str(c) for c in self.call_queue) @@ -205,6 +240,23 @@ class GraphDataHandling(pystencils.datahandling.SerialDataHandling): self.call_queue.append('Fill ' + array_name) super().fill(array_name, val, value_idx, slice_obj, ghost_layers, inner_ghost_layers) + def merge_swaps_with_kernel_calls(self, call_queue=None): + # TODO(seitz): should be moved to ComputationGraph + + if call_queue is None: + call_queue = self.call_queue + + relevant_swaps = [(swap, predecessor) for (swap, predecessor) in zip(call_queue[1:], call_queue[:-1]) + if isinstance(swap, Swap) and isinstance(predecessor, KernelCall)] + for s, pred in relevant_swaps: + call_queue.remove(s) + if (s.field, s.destination) not in pred.tmp_field_swaps: + pred.tmp_field_swaps.append((s.field, s.destination)) + + for t in call_queue: + if isinstance(t, TimeloopRun): + self.merge_swaps_with_kernel_calls(t.timeloop._single_step_asts) + # TODO # def reduce_float_sequence(self, sequence, operation, all_reduce=False) -> np.array: # return np.array(sequence) diff --git a/src/pystencils_autodiff/walberla.py b/src/pystencils_autodiff/walberla.py index cf2839d35ea5b46fc01d45252bb2adf405f9f059..87d2066eb6ab661f10f7ae66a0d5a0fa16c8703d 100644 --- a/src/pystencils_autodiff/walberla.py +++ b/src/pystencils_autodiff/walberla.py @@ -17,20 +17,25 @@ from stringcase import camelcase, pascalcase import pystencils from pystencils.astnodes import SympyAssignment -from pystencils.data_types import TypedSymbol +from pystencils.data_types import TypedSymbol, create_type from pystencils_autodiff._file_io import read_template_from_file from pystencils_autodiff.framework_integration.astnodes import JinjaCppFile +def _make_field_type(field, on_gpu): + from pystencils_walberla.jinja_filters import make_field_type, get_field_fsize + f_size = get_field_fsize(field) + + return make_field_type(pystencils.data_types.get_base_type(field.dtype), f_size, on_gpu) + + class FieldType(JinjaCppFile): TEMPLATE = jinja2.Template("{{ field_type }}") def __init__(self, field: pystencils.Field, on_gpu: bool): - from pystencils_walberla.jinja_filters import make_field_type, get_field_fsize - f_size = get_field_fsize(field) - field_type = make_field_type(pystencils.data_types.get_base_type(field.dtype), f_size, on_gpu) + field_type = _make_field_type(field, on_gpu) ast_dict = {'on_gpu': on_gpu, 'field_type': field_type @@ -393,16 +398,22 @@ class FillFromFlagField(JinjaCppFile): class LbCommunicationSetup(JinjaCppFile): - TEMPLATE = jinja2.Template("""blockforest::communication::UniformBufferedScheme<lbm::{{ lb_model_type }}::CommunicationStencil> {{ communication }}( blocks ); -{{ communication }}.addPackInfo( make_shared< lbm::PdfFieldPackInfo<lbm::{{ lb_model_type }}> >( {{ pdf_id }} ) ); + TEMPLATE = jinja2.Template(""" +{{CommonicationScheme}}<lbm::{{ lb_model_type }}::CommunicationStencil> {{ communication }}( blocks ); +{{ communication }}.addPackInfo( make_shared< {{ packinfo_class }} >( {{ pdf_id }} ) ); """) # noqa - def __init__(self, lb_model_type, pdf_id): + def __init__(self, lb_model_type, pdf_id, packinfo_class, gpu): self._symbol = TypedSymbol('communication', 'auto') + self.gpu = gpu + ast_dict = { 'lb_model_type': lb_model_type, 'pdf_id': pdf_id, 'communication': self._symbol, + 'packinfo_class': packinfo_class, + 'CommonicationScheme': ('cuda::communication::UniformGPUScheme' + if gpu else 'blockforest::communication::UniformBufferedScheme') } super().__init__(ast_dict) @@ -414,7 +425,14 @@ class LbCommunicationSetup(JinjaCppFile): def symbols_defined(self): return {self.symbol} - headers = ['"blockforest/communication/UniformBufferedScheme.h"', '"lbm/communication/PdfFieldPackInfo.h"'] + @property + def headers(self): + if self.gpu: + return ['"cuda/communication/UniformGPUScheme.h"', '"lbm/communication/PdfFieldPackInfo.h"', '"PackInfo.h"'] + else: + return ['"blockforest/communication/UniformBufferedScheme.h"', + '"lbm/communication/PdfFieldPackInfo.h"', + '"PackInfo.h"'] class BeforeFunction(JinjaCppFile): @@ -490,6 +508,40 @@ class TimeLoop(JinjaCppFile): headers = ['"timeloop/all.h"'] +class ForLoop(JinjaCppFile): + + TEMPLATE = jinja2.Template(""" +for( {{node.loop_symbol.dtype}} {{node.loop_symbol}} = {{node.loop_start}}; {{node.loop_symbol}} <= {{node.loop_end}}; {{node.loop_symbol}}+= {{node.loop_increment}} ) { +{%- for c in children %} +{{ c | indent(3) }} +{%- endfor -%} +} +""") # noqa + + def __init__(self, + loop_start, + loop_end, + children, + loop_symbol=TypedSymbol('t_', create_type('int64')), + loop_increment=1): + ast_dict = { + 'loop_symbol': loop_symbol, + 'loop_start': loop_start, + 'loop_end': loop_end, + 'children': children, + 'loop_increment': loop_increment + } + super().__init__(ast_dict) + + @property + def loop_symbol(self): + return self.ast_dict.loop_symbol + + @property + def symbols_defined(self): + return {self.loop_symbol} + + class U_Rho_Adaptor(JinjaCppFile): """Docstring for U_Rho_Adaptor. """ @@ -683,7 +735,13 @@ class InitBoundaryHandling(JinjaCppFile): {% endfor %} """) # noqa - def __init__(self, block_forest, flag_field_id, pdf_field_id, boundary_conditions, boundary_kernel: dict, field_allocations): + def __init__(self, + block_forest, + flag_field_id, + pdf_field_id, + boundary_conditions, + boundary_kernel: dict, + field_allocations): self.fluid = FlagUidDefinition("fluid") ast_dict = {'fluid_uid_definition': self.fluid, 'geometry_initialization': BoundaryHandlingFromConfig(block_forest, @@ -768,7 +826,11 @@ class GeneratedBoundaryInitialization(JinjaCppFile): class SweepCreation(JinjaCppFile): TEMPLATE = jinja2.Template("""{{ sweep_class_name }}( {{ parameter_str }} )""") # noqa - def __init__(self, sweep_class_name: str, field_allocation: AllocateAllFields, ast, parameters_to_ignore=None): + def __init__(self, + sweep_class_name: str, + field_allocation: AllocateAllFields, + ast, + parameters_to_ignore=[]): def resolve_parameter(p): if ast.target == 'cpu': dict = field_allocation._cpu_allocations @@ -780,7 +842,8 @@ class SweepCreation(JinjaCppFile): parameters = ast.get_parameters() parameter_ids = [resolve_parameter(p) for p in parameters - if p.is_field_pointer or not p.is_field_parameter] + if (p.is_field_pointer and p.symbol.field_name not in parameters_to_ignore) + or not p.is_field_parameter] ast_dict = {'sweep_class_name': sweep_class_name, 'parameter_ids': parameter_ids, @@ -806,3 +869,52 @@ for( auto& block : *{{block_forest}}) {{sweep_class_name | lower() }}(&block);"" 'sweep_class_name': functor.ast_dict.sweep_class_name, 'block_forest': block_forest} super().__init__(ast_dict) + + +class FieldCopy(JinjaCppFile): + TEMPLATE = jinja2.Template("""cuda::fieldCpy<{{ src_type }}, {{ dst_type }}>({{ block_forest }}, {{ src_id }}, {{ dst_id }});""") # noqa + + def __init__(self, block_forest, src_id, src_field, src_gpu, dst_id, dst_field, dst_gpu): + src_type = _make_field_type(src_field, src_gpu) + dst_type = _make_field_type(dst_field, dst_gpu) + ast_dict = {'src_id': src_id, + 'dst_id': dst_id, + 'src_type': src_type, + 'dst_type': dst_type, + 'block_forest': block_forest} + super().__init__(ast_dict) + + headers = ['"cuda/FieldCopy.h"'] + + +class SwapFields(JinjaCppFile): + """ + .. warn:: + + Prefer temporary fields in sweeps over this class! Two full fields have higher memory usage. + + """ + TEMPLATE = jinja2.Template("""std::swap({{field_id0}}, {{field_id1}});""") + + def __init__(self, field_id0, field_id1): + ast_dict = {'field_id0': field_id0, + 'field_id1': field_id1} + super().__init__(ast_dict) + + headers = ["<algorithm>"] + + +class Communication(JinjaCppFile): + """ + .. warn:: + + Prefer temporary fields in sweeps over this class! Two full fields have higher memory usage. + + """ + TEMPLATE = jinja2.Template("""communication()""") + + def __init__(self, gpu): + ast_dict = {'gpu': gpu} + super().__init__(ast_dict) + + headers = ["<algorithm>"] diff --git a/src/pystencils_autodiff/wald_und_wiesen_simulation.py b/src/pystencils_autodiff/wald_und_wiesen_simulation.py index 662a6d0f0d04dcee9c46645f1913f67eb99bf38a..9100dfb82720c616f9ecf2c35bbb75c030aef27d 100644 --- a/src/pystencils_autodiff/wald_und_wiesen_simulation.py +++ b/src/pystencils_autodiff/wald_und_wiesen_simulation.py @@ -7,19 +7,20 @@ """ +import itertools from typing import Dict import sympy as sp -from stringcase import pascalcase +from stringcase import camelcase, pascalcase import lbmpy_walberla import pystencils import pystencils_walberla.codegen from pystencils.astnodes import Block, EmptyLine from pystencils_autodiff.walberla import ( - AllocateAllFields, CMakeLists, DefinitionsHeader, InitBoundaryHandling, LbCommunicationSetup, - ResolveUndefinedSymbols, RunTimeLoop, SweepCreation, SweepOverAllBlocks, TimeLoop, - UniformBlockforestFromConfig, WalberlaMain, WalberlaModule) + AllocateAllFields, CMakeLists, DefinitionsHeader, FieldCopy, InitBoundaryHandling, + LbCommunicationSetup, ResolveUndefinedSymbols, RunTimeLoop, SwapFields, SweepCreation, + SweepOverAllBlocks, TimeLoop, UniformBlockforestFromConfig, WalberlaMain, WalberlaModule) class WaldUndWiesenSimulation(): @@ -35,7 +36,8 @@ class WaldUndWiesenSimulation(): codegen_context, boundary_handling: pystencils.boundaries.BoundaryHandling = None, lb_rule=None, - refinement_scaling=None): + refinement_scaling=None, + boundary_handling_target='gpu'): self._data_handling = graph_data_handling self._lb_rule = lb_rule self._refinement_scaling = refinement_scaling @@ -49,6 +51,17 @@ class WaldUndWiesenSimulation(): self._with_gui = False self._with_gui_default = False self._boundary_kernels = {} + self._boundary_handling_target = boundary_handling_target + pystencils_walberla.codegen.generate_pack_info_for_field( + self._codegen_context, + 'PackInfo', + pystencils.Field.create_generic(graph_data_handling.fields['ldc_pdf'].name, + graph_data_handling.fields['ldc_pdf'].spatial_dimensions, + graph_data_handling.fields['ldc_pdf'].dtype.numpy_dtype, + graph_data_handling.fields['ldc_pdf'].index_dimensions, + index_shape=graph_data_handling.fields['ldc_pdf'].index_shape,), + target=self._boundary_handling_target) + self._packinfo_class = 'PackInfo' def _create_helper_files(self) -> Dict[str, str]: if self._lb_rule: @@ -58,7 +71,13 @@ class WaldUndWiesenSimulation(): if self._boundary_handling: for bc in self.boundary_conditions: self._boundary_kernels.update({bc.name: lbmpy_walberla.generate_boundary( - self._codegen_context, pascalcase(bc.name), bc, self._lb_rule.method)}) + self._codegen_context, + pascalcase(bc.name), + bc, + self._lb_rule.method, + target=self._boundary_handling_target)}, + ) + self._bh_cycler = itertools.cycle(self._boundary_kernels.keys()) def _create_module(self): if self._lb_rule: @@ -77,10 +96,11 @@ class WaldUndWiesenSimulation(): if self._lb_rule: pdf_field_id = field_allocations._gpu_allocations.get( - 'ldc_pdfSrc', field_allocations._cpu_allocations['ldc_pdfSrc']).symbol + 'ldc_pdf', field_allocations._cpu_allocations['ldc_pdf']).symbol else: pdf_field_id = None + self._data_handling.merge_swaps_with_kernel_calls() call_nodes = filter(lambda x: x, [self._graph_to_sweep(c) for c in self._data_handling.call_queue]) module = WalberlaModule(WalberlaMain(Block([ @@ -96,7 +116,9 @@ class WaldUndWiesenSimulation(): self._field_allocations) if self._boundary_handling else EmptyLine(), LbCommunicationSetup(self._lb_model_name, - pdf_field_id) + pdf_field_id, + self._packinfo_class, + self._boundary_handling_target) if self._lb_rule else EmptyLine(), *call_nodes ]), self.parameter_config_block) @@ -130,28 +152,79 @@ class WaldUndWiesenSimulation(): return self._boundary_handling._boundary_object_to_boundary_info.keys() def _graph_to_sweep(self, c): - from pystencils_autodiff.graph_datahandling import KernelCall, TimeloopRun + from pystencils_autodiff.graph_datahandling import KernelCall, TimeloopRun, DataTransferKind, DataTransfer if isinstance(c, KernelCall): sweep_class_name = next(self._kernel_class_generator) pystencils_walberla.codegen.generate_sweep( self._codegen_context, sweep_class_name, c.kernel.ast) - rtn = SweepOverAllBlocks(SweepCreation(sweep_class_name, self._field_allocations, - c.kernel.ast), self._block_forest.blocks) + rtn = SweepOverAllBlocks(SweepCreation(sweep_class_name, + self._field_allocations, + c.kernel.ast, + parameters_to_ignore=[s[1].name for s in c.tmp_field_swaps]), + self._block_forest.blocks) elif isinstance(c, TimeloopRun): sweeps = [] for a in c.timeloop._single_step_asts: - if 'indexField' in [f.name for f in a.fields_accessed]: - continue - sweep_class_name = next(self._kernel_class_generator) - pystencils_walberla.codegen.generate_sweep( - self._codegen_context, sweep_class_name, a) - sweeps.append(SweepCreation(sweep_class_name, self._field_allocations, a)) + if isinstance(a, KernelCall): + if 'indexField' in [f.name for f in a.kernel.ast.fields_accessed]: + bh = next(self._bh_cycler) + sweeps.append(camelcase(bh)) + continue + sweep_class_name = next(self._kernel_class_generator) + pystencils_walberla.codegen.generate_sweep( + self._codegen_context, sweep_class_name, a.kernel.ast, field_swaps=a.tmp_field_swaps) + sweeps.append(SweepCreation(sweep_class_name, + self._field_allocations, + a.kernel.ast, + parameters_to_ignore=[s[1].name for s in a.tmp_field_swaps])) + + elif isinstance(c, DataTransfer): + if c.kind in (DataTransferKind.HOST_COMMUNICATION, DataTransferKind.DEVICE_COMMUNICATION): + src = self._field_allocations._cpu_allocations[c.field.name].symbol + dst = self._field_allocations._cpu_allocations[c.destination.name].symbol + rtn = SwapFields(src, dst) + elif c.kind == DataTransferKind.DEVICE_SWAP: + src = self._field_allocations._gpu_allocations[c.field.name].symbol + dst = self._field_allocations._gpu_allocations[c.destination.name].symbol + rtn = SwapFields(src, dst) + elif c.kind == DataTransferKind.HOST_TO_DEVICE: + src = self._field_allocations._cpu_allocations[c.field.name].symbol + dst = self._field_allocations._gpu_allocations[c.field.name].symbol + rtn = FieldCopy(self._block_forest.blocks, src, c.field, False, dst, c.field, True) + elif c.kind == DataTransferKind.DEVICE_TO_HOST: + src = self._field_allocations._gpu_allocations[c.field.name].symbol + dst = self._field_allocations._cpu_allocations[c.field.name].symbol + rtn = FieldCopy(self._block_forest.blocks, src, c.field, True, dst, c.field, False) + else: + rtn = None + else: + print(f'time {c}') loop = TimeLoop(self._block_forest.blocks, [], sweeps, [], sp.S(c.time_steps)) rtn = Block([loop, RunTimeLoop(self._block_forest.blocks, loop, self._with_gui, self._with_gui_default)]) + elif isinstance(c, DataTransfer): + if c.kind == DataTransferKind.HOST_SWAP: + src = self._field_allocations._cpu_allocations[c.field.name].symbol + dst = self._field_allocations._cpu_allocations[c.destination.name].symbol + rtn = SwapFields(src, dst) + elif c.kind == DataTransferKind.DEVICE_SWAP: + src = self._field_allocations._gpu_allocations[c.field.name].symbol + dst = self._field_allocations._gpu_allocations[c.destination.name].symbol + rtn = SwapFields(src, dst) + elif c.kind == DataTransferKind.HOST_TO_DEVICE: + src = self._field_allocations._cpu_allocations[c.field.name].symbol + dst = self._field_allocations._gpu_allocations[c.field.name].symbol + rtn = FieldCopy(self._block_forest.blocks, src, c.field, False, dst, c.field, True) + elif c.kind == DataTransferKind.DEVICE_TO_HOST: + src = self._field_allocations._gpu_allocations[c.field.name].symbol + dst = self._field_allocations._cpu_allocations[c.field.name].symbol + rtn = FieldCopy(self._block_forest.blocks, src, c.field, True, dst, c.field, False) + else: + rtn = None else: - return None + rtn = None + return rtn diff --git a/tests/test_walberla.py b/tests/test_walberla.py index a9954b779baf1969b22f5810b185de2e1232555a..40ae8cbe6646b14f63d496e7cae05e461d2c18bb 100644 --- a/tests/test_walberla.py +++ b/tests/test_walberla.py @@ -12,9 +12,8 @@ from os.path import dirname, join import numpy as np -import lbmpy_walberla import pystencils -from lbmpy.creationfunctions import create_lb_collision_rule, create_lbm_kernel +from lbmpy.creationfunctions import create_lb_collision_rule from pystencils.astnodes import Block, EmptyLine, SympyAssignment from pystencils.data_types import TypedSymbol, create_type from pystencils_autodiff._file_io import write_file @@ -83,6 +82,8 @@ def test_wald_wiesen_lbm(): lbm_step = ldc_setup(domain_size=(30, 30), optimization=opt_params, fixed_loop_sizes=False, lid_velocity=lid_velocity) + # del lbm_step.data_handling.gpu_arrays.ldc_pdf_tmp + sim = WaldUndWiesenSimulation(lbm_step.data_handling, ctx, lbm_step.boundary_handling,