diff --git a/apps/benchmarks/NonUniformGridGPU/GridGeneration.h b/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
index 618bee66e5ac50ea0e97069e15ab3d53ae7db873..a37f4be4b5fe7440f08116ab25ab30dd3e0b8111 100644
--- a/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
+++ b/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
@@ -40,7 +40,8 @@ using Stencil_T = StorageSpecification_T::Stencil;
 using namespace walberla;
 
 void createSetupBlockForest(SetupBlockForest& setupBfs,
-                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
+                            const bool useMPIManager=false)
 {
    WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
 
@@ -50,11 +51,14 @@ void createSetupBlockForest(SetupBlockForest& setupBfs,
    Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
 
    const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
-   const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+   uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
    const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
    const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
    const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
 
+   if(useMPIManager)
+      numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
+
    const LDC ldc(refinementDepth);
    auto refSelection = ldc.refinementSelector();
@@ -64,6 +68,9 @@ void createSetupBlockForest(SetupBlockForest& setupBfs,
    setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
    setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
 
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
    {
       std::ostringstream oss;
      oss << blockForestFilestem << ".bfs";
@@ -101,4 +108,36 @@ void createSetupBlockForest(SetupBlockForest& setupBfs,
       WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
    }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs,
+                       const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+{
+   if (mpi::MPIManager::instance()->numProcesses() > 1)
+   {
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      if(!std::filesystem::exists(setupBlockForestFilepath.c_str()))
+      {
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
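+         // Fall back to building the SetupBlockForest in memory on every process; passing
+         // useMPIManager == true makes the load balancing use the actual number of MPI processes.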
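+      // Mark every block that touches the y-max (lid) border of the domain for refinement,
+      // instead of intersecting hand-built AABBs around the lid corners.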
       for(auto & block : forest)
       {
-         auto & aabb = block.getAABB();
-         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         if( forest.atDomainYMaxBorder(block) )
          {
            if( block.getLevel() < refinementDepth_)
               block.setMarker( true );
         }
      }
    }
 };
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index aed74d7523cfdc5f630dc5e8ae7e30c07973324f..f6d36fff4ade177a44875777659710ef1f58e09f 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -93,6 +93,7 @@ int main(int argc, char** argv)
    logging::configureLogging(config);
    auto domainSetup = config->getOneBlock("DomainSetup");
    auto blockForestSetup = config->getOneBlock("SetupBlockForest");
+   const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
    Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
    // Reading parameters
@@ -103,33 +104,12 @@
    const bool gpuEnabledMPI = parameters.getParameter< bool >("gpuEnabledMPI", false);
 
    shared_ptr< BlockForest > bfs;
+   createBlockForest(bfs, domainSetup, blockForestSetup);
 
-   if (mpi::MPIManager::instance()->numProcesses() > 1)
+   if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
    {
-      const std::string blockForestFilestem =
-         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
-      // Load structured block forest from file
-      std::ostringstream oss;
-      oss << blockForestFilestem << ".bfs";
-      const std::string setupBlockForestFilepath = oss.str();
-
-      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
-                                            setupBlockForestFilepath.c_str(), false);
-   }
-   else
-   {
-      SetupBlockForest setupBfs;
-      createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
-      const bool writeSetupForestAndReturn =
-         blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
-
-      if (writeSetupForestAndReturn)
-      {
-         WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
-         return EXIT_SUCCESS;
-      }
-
-      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and written to file. Returning program")
+      return EXIT_SUCCESS;
    }
 
    auto blocks =
@@ -211,7 +191,7 @@
       LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, communication, packInfo);
 
    SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
-   // LBMMeshRefinement.test(5);
+   LBMMeshRefinement.test(1);
    // return EXIT_SUCCESS;
 
    LBMMeshRefinement.addRefinementToTimeLoop(timeLoop);
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
index 89d5e448c3dedd07ab09307961efe576dfb15942..5c74463c90cae3e0b2dc5c26d0014ddd0ec1012f 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
@@ -30,13 +30,13 @@ const char * infoCollisionSetup = "{collision_setup}";
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
-
+
 with CodeGeneration() as ctx:
     field_type = "float64" if ctx.double_accuracy else "float32"
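+    # 'esopull' is an in-place (esoteric pull) streaming pattern: even and odd timesteps use
+    # different PDF accessors (cf. the Timestep handling in the nonuniform packing kernels).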
-    streaming_pattern = 'aa'
+    streaming_pattern = 'esopull'
     timesteps = get_timesteps(streaming_pattern)
 
-    stencil = LBStencil(Stencil.D3Q19)
+    stencil = LBStencil(Stencil.D3Q27)
     assert stencil.D == 3, "This application supports only three-dimensional stencils"
 
     pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index d17a69f56f0374eb7a63106cfa9ba8b11a013e68..3ecd2b8b08c14069d40cb3d4042fb69f7c96f227 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -22,7 +22,7 @@ class Scenario:
                  timesteps=101,
                  gpu_enabled_mpi=False,
                  vtk_write_frequency=0,
-                 logger_frequency=0,
+                 logger_frequency=30,
                  blockforest_filestem="blockforest",
                  write_setup_vtk=True):
@@ -66,7 +66,7 @@ class Scenario:
                 'timesteps': self.timesteps,
                 'remainingTimeLoggerFrequency': self.logger_frequency,
                 'vtkWriteFrequency': self.vtk_write_frequency,
-                'useVTKAMRWriter': True,
+                'useVTKAMRWriter': False,
                 'oneFilePerProcess': False,
                 'gpuEnabledMPI': self.gpu_enabled_mpi,
             },
@@ -120,8 +120,8 @@ def validation_run():
     """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
     wlb.log_info_on_root("Validation run")
 
-    domain_size = (96, 96, 96)
-    cells_per_block = (32, 32, 32)
+    domain_size = (48, 48, 48)
+    cells_per_block = (16, 16, 16)
 
     root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
@@ -129,6 +129,8 @@ def validation_run():
     scenario = Scenario(domain_size=domain_size,
                         root_blocks=root_blocks,
                         cells_per_block=cells_per_block,
+                        timesteps=10001,
+                        vtk_write_frequency=2500,
                         refinement_depth=1,
                         gpu_enabled_mpi=False)
     scenarios.add(scenario)
@@ -149,7 +151,7 @@ def scaling(num_proc, gpu_enabled_mpi=False, uniform=True):
         factor = int(num_proc // 4)
         name = "nonuniform"
-        cells_per_block = (184, 184, 184)
+        cells_per_block = (136, 136, 136)
         domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
 
     root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
@@ -166,5 +168,5 @@ def scaling(num_proc, gpu_enabled_mpi=False, uniform=True):
     scenarios.add(scenario)
 
-# validation_run()
-scaling(4, True, False)
+validation_run()
+# scaling(4, False, False)
diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py
index 8a8728031cd4eb56ba4366e25ef6fd85b1f2e1b5..c977e3b30f7862a5133399602ef34ae1931547a4 100644
--- a/python/lbmpy_walberla/packing_kernels.py
+++ b/python/lbmpy_walberla/packing_kernels.py
@@ -7,6 +7,8 @@ import sympy as sp
 from jinja2 import Environment, PackageLoader, StrictUndefined
 
 from pystencils import Assignment, CreateKernelConfig, create_kernel, Field, FieldType, fields, Target
+from pystencils.astnodes import LoopOverCoordinate
+from pystencils.integer_functions import int_div
 from pystencils.stencil import offset_to_direction_string
 from pystencils.typing import TypedSymbol
 from pystencils.stencil import inverse_direction
@@ -18,7 +20,7 @@ from lbmpy.enums import Stencil
 from lbmpy.stencils import LBStencil
 
 from pystencils_walberla.cmake_integration import CodeGenerationContext
-from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode
+from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode, AbortNode
 from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
 from pystencils_walberla.utility import config_from_context
@@ -115,6 +117,8 @@ class PackingKernelsCodegen:
 
     def create_nonuniform_kernel_families(self, kernels_dict=None):
         kernels = dict() if kernels_dict is None else kernels_dict
+        kernels['localCopyRedistribute'] = self.get_local_copy_redistribute_kernel_family()
+        kernels['localPartialCoalescence'] = self.get_local_copy_partial_coalescence_kernel_family()
         kernels['unpackRedistribute'] = self.get_unpack_redistribute_kernel_family()
         kernels['packPartialCoalescence'] = self.get_pack_partial_coalescence_kernel_family()
         kernels['zeroCoalescenceRegion'] = self.get_zero_coalescence_region_kernel_family()
@@ -283,15 +287,59 @@ class PackingKernelsCodegen:
         return create_kernel(assignments, config=config)
 
     def get_unpack_redistribute_kernel_family(self):
-        return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast)
+        return self._construct_directionwise_kernel_family(self.get_unpack_redistribute_ast,
+                                                           exclude_time_step=Timestep.EVEN)
 
     def get_local_copy_redistribute_ast(self, comm_dir, timestep):
-        # TODO
-        raise NotImplementedError()
+        assert not all(d == 0 for d in comm_dir)
+        ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
+
+        dir_string = offset_to_direction_string(comm_dir)
+        streaming_dirs = self.get_streaming_dirs(inverse_direction(comm_dir))
+        dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
+        if len(dir_indices) == 0:
+            return None
+
+        # for inplace streaming the dst (fine grid) must always be on odd state
+        dst_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
+
+        _, dst = self._stream_out_accs(dst_timestep)
+        src, _ = self._stream_out_accs(timestep)
+
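+        # The kernel traverses the fine (dst) region with stride 2 (see iteration_slice below);
+        # every visited cell writes a full 2^d block of fine cells, while the coarse (src) value
+        # is read via an absolute access at floor(ctr / 2) - src and dst share no iteration space.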
+        src_abs = self.src_field.new_field_with_different_name(self.src_field.name)
+        src_abs.field_type = FieldType.CUSTOM
+
+        orthos = self.orthogonal_principals(comm_dir)
+        sub_dirs = self.contained_principals(comm_dir)
+        orthogonal_combinations = self.linear_combinations(orthos)
+        subdir_combinations = self.linear_combinations_nozero(sub_dirs)
+        second_gl_dirs = [o + s for o, s in product(orthogonal_combinations, subdir_combinations)]
+        negative_dir_correction = np.array([(1 if d == -1 else 0) for d in comm_dir])
+
+        assignments = []
+        for offset in orthogonal_combinations:
+            o = offset + negative_dir_correction
+            for d in range(self.values_per_cell):
+                field_acc = dst[d].get_shifted(*o)
+                src_access = [int_div(ctr[i], 2) + off for i, off in enumerate(src[d].offsets)]
+                assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
+
+        for offset in second_gl_dirs:
+            o = offset + negative_dir_correction
+            for d in dir_indices:
+                field_acc = dst[d].get_shifted(*o)
+                src_access = [int_div(ctr[i], 2) + off for i, off in enumerate(src[d].offsets)]
+                assignments.append(Assignment(field_acc, src_abs.absolute_access(src_access, (d, ))))
+
+        function_name = f'localCopyRedistribute_{dir_string}' + timestep_suffix(timestep)
+        iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
+        config = CreateKernelConfig(function_name=function_name, iteration_slice=iteration_slice,
+                                    data_type=self.data_type, ghost_layers=0, allow_double_writes=True,
+                                    cpu_openmp=self.config.cpu_openmp, target=self.config.target)
+
+        return create_kernel(assignments, config=config)
 
     def get_local_copy_redistribute_kernel_family(self):
-        # TODO
-        raise NotImplementedError()
+        return self._construct_directionwise_kernel_family(self.get_local_copy_redistribute_ast)
 
     # --------------------------- Pack / Unpack / LocalCopy Fine to Coarse ---------------------------------------------
@@ -322,7 +370,8 @@ class PackingKernelsCodegen:
         return ast
 
     def get_pack_partial_coalescence_kernel_family(self):
-        return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast)
+        return self._construct_directionwise_kernel_family(self.get_pack_partial_coalescence_ast,
+                                                           exclude_time_step=Timestep.ODD)
 
     def get_unpack_coalescence_ast(self, comm_dir, timestep):
         config = replace(self.config, ghost_layers=0)
@@ -370,12 +419,53 @@ class PackingKernelsCodegen:
     def get_zero_coalescence_region_kernel_family(self):
         return self._construct_directionwise_kernel_family(self.get_zero_coalescence_region_ast)
 
-    # TODO
     def get_local_copy_partial_coalescence_ast(self, comm_dir, timestep):
-        raise NotImplementedError()
+        assert not all(d == 0 for d in comm_dir)
+        ctr = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(self.stencil.D)]
+
+        dir_string = offset_to_direction_string(comm_dir)
+        streaming_dirs = self.get_streaming_dirs(comm_dir)
+        dir_indices = sorted(self.stencil.index(d) for d in streaming_dirs)
+        if len(dir_indices) == 0:
+            return None
+
+        buffer = sp.symbols(f"b_:{self.values_per_cell}")
+
+        # for inplace streaming the src (fine grid) must always be on even state
+        src_timestep = Timestep.ODD if self.inplace else Timestep.BOTH
+
+        src, _ = self._stream_in_accs(src_timestep)
+        _, dst = self._stream_in_accs(timestep.next())
+        mask = self.mask_field
+
+        dst_abs = self.dst_field.new_field_with_different_name(self.dst_field.name)
+        dst_abs.field_type = FieldType.CUSTOM
+
+        coalescence_factor = sp.Rational(1, 2 ** self.dim)
+
+        offsets = list(product(*((0, 1) for _ in comm_dir)))
+        assignments = []
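+        # Two-phase update: first gather the mask-filtered fine populations of each participating
+        # direction into the temporary symbols b_i, then add them, scaled by 1/2^d, onto the
+        # coarse destination cell at floor(ctr / 2) via an absolute access.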
+        for i, d in enumerate(dir_indices):
+            acc = 0
+            for o in offsets:
+                acc += flag_cond(d, mask[o], src[d].get_shifted(*o))
+            assignments.append(Assignment(buffer[i], acc))
+
+        for i, d in enumerate(dir_indices):
+            index = dst[d].index
+            dst_access = [int_div(ctr[k], 2) + off for k, off in enumerate(dst[d].offsets)]
+            assignments.append(Assignment(dst_abs.absolute_access(dst_access, index),
+                                          dst_abs.absolute_access(dst_access, index) + coalescence_factor * buffer[i]))
+
+        iteration_slice = tuple(slice(None, None, 2) for _ in range(self.dim))
+        config = replace(self.config, iteration_slice=iteration_slice, ghost_layers=0)
+
+        ast = create_kernel(assignments, config=config)
+        ast.function_name = f'localPartialCoalescence_{dir_string}' + timestep_suffix(timestep)
+        return ast
 
     def get_local_copy_partial_coalescence_kernel_family(self):
-        raise NotImplementedError()
+        return self._construct_directionwise_kernel_family(self.get_local_copy_partial_coalescence_ast)
 
     # ------------------------------------------ Utility ---------------------------------------------------------------
@@ -425,7 +515,7 @@ class PackingKernelsCodegen:
 
     # --------------------------- Private Members ----------------------------------------------------------------------
 
-    def _construct_directionwise_kernel_family(self, create_ast_callback):
+    def _construct_directionwise_kernel_family(self, create_ast_callback, exclude_time_step=None):
         subtrees = []
         direction_symbol = TypedSymbol('dir', dtype='stencil::Direction')
         for t in get_timesteps(self.streaming_pattern):
@@ -439,7 +529,10 @@ class PackingKernelsCodegen:
                 continue
                kernel_call = KernelCallNode(ast)
                cases_dict[f"stencil::{dir_string}"] = kernel_call
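+            # For inplace streaming patterns some nonuniform kernels are only valid on one of the
+            # two timestep states; the excluded state gets an AbortNode instead of a kernel switch.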
+            if exclude_time_step is not None and t == exclude_time_step:
+                subtrees.append(AbortNode("This function cannot be called! Please contact the waLBerla team"))
+            else:
+                subtrees.append(SwitchNode(direction_symbol, cases_dict))
 
         if not self.inplace:
             tree = subtrees[0]
diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
index 92106cddf0248c93707e6881b600b6a702d27985..caafd77c2e9506784e0451333d86b0c322420306 100644
--- a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
+++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
@@ -42,6 +42,8 @@ namespace {{namespace}} {
 {{ kernels['localCopyDirection'] | generate_definitions }}
 
 {% if nonuniform -%}
+{{ kernels['localCopyRedistribute'] | generate_definitions }}
+{{ kernels['localPartialCoalescence'] | generate_definitions }}
 {{ kernels['unpackRedistribute'] | generate_definitions }}
 {{ kernels['packPartialCoalescence'] | generate_definitions }}
 {{ kernels['zeroCoalescenceRegion'] | generate_definitions }}
@@ -89,7 +91,7 @@ namespace {{namespace}} {
       WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
 
       {{kernels['localCopyAll']
-        | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
+        | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
         | indent(6) }}
    }
@@ -128,11 +130,31 @@ namespace {{namespace}} {
       WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
 
       {{kernels['localCopyDirection']
-        | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
+        | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream')
         | indent(6) }}
    }
 
 {% if nonuniform -%}
+   void {{class_name}}::PackKernels::localCopyRedistribute(
+      {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
+            "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
+            ["gpuStream_t stream"] if is_gpu else []]
+          | type_identifier_list -}}
+   ) const
+   {
+      {{kernels['localCopyRedistribute'] | generate_call(cell_interval={src_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
+   }
+
+   void {{class_name}}::PackKernels::localPartialCoalescence(
+      {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
+            "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
+            ["gpuStream_t stream"] if is_gpu else []]
+          | type_identifier_list -}}
+   ) const
+   {
+      {{kernels['localPartialCoalescence'] | generate_call(cell_interval={src_field.name : 'srcInterval', mask_field.name : 'srcInterval', dst_field.name : 'dstInterval'}, stream='stream') | indent(6) }}
+   }
+
    void {{class_name}}::PackKernels::unpackRedistribute(
       {{- [ "PdfField_T * " + dst_field.name, "CellInterval & ci",
             "unsigned char * inBuffer", kernels['unpackDirection'].kernel_selection_parameters,
diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
index ad40a55eef10c08deb66e6266753a828be9f3452..5ca6f66595d296a071eb2e71f0f603c71f6ac640 100644
--- a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
+++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
@@ -209,6 +209,26 @@ class {{class_name}}
 
 {% if nonuniform -%}
+   /**
+    * Locally copies and uniformly redistributes populations from a coarse block onto the fine grid.
+    * */
+   void localCopyRedistribute(
+      {{- [ "PdfField_T * " + src_field.name, "CellInterval & srcInterval",
+            "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localCopyRedistribute'].kernel_selection_parameters,
+            ["gpuStream_t stream = nullptr"] if is_gpu else []]
+          | type_identifier_list -}}
+   ) const;
+
+   /**
+    * Locally coalesces populations from a fine block onto the coarse grid, respecting the
+    * partial coalescence mask.
+    * */
+   void localPartialCoalescence(
+      {{- [ "PdfField_T * " + src_field.name, "MaskField_T * " + mask_field.name, "CellInterval & srcInterval",
+            "PdfField_T * " + dst_field.name, "CellInterval & dstInterval", kernels['localPartialCoalescence'].kernel_selection_parameters,
+            ["gpuStream_t stream = nullptr"] if is_gpu else []]
+          | type_identifier_list -}}
+   ) const;
+
    /**
    * Unpacks and uniformly redistributes populations coming from a coarse block onto the fine grid.
    * */
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
index 6d05bf8ffd51808821311bf09f0db263570983b9..66384f790e9ad390effd5d72d4de57c6750082a2 100644
--- a/python/pystencils_walberla/jinja_filters.py
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -296,7 +296,7 @@ def generate_call(ctx, kernel, ghost_layers_to_include=0, cell_interval=None, st
         if isinstance(cell_interval, str):
             return cell_interval
         elif isinstance(cell_interval, dict):
-            return cell_interval[field_object]
+            return cell_interval[field_object.name]
         else:
             return None
diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py
index ad8a99867e0970b102409823f6a17258983bae2b..a41f7b93595a9484ae6424f6c85e528ebc2ce4c0 100644
--- a/python/pystencils_walberla/kernel_selection.py
+++ b/python/pystencils_walberla/kernel_selection.py
@@ -157,6 +157,21 @@ class SwitchNode(AbstractKernelSelectionNode):
         return switch_code
 
 
+class AbortNode(AbstractKernelSelectionNode):
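+    """Selection tree leaf that expands to an unconditional WALBERLA_ABORT with the given
+    message, for branches (e.g. excluded timesteps) that must never be reached at runtime."""
+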
+    def __init__(self, message):
+        self.message = message
+
+    @property
+    def selection_parameters(self):
+        return set()
+
+    def collect_kernel_calls(self):
+        return set()
+
+    def get_code(self, **kwargs):
+        return f'WALBERLA_ABORT("{self.message}")'
+
+
 class KernelCallNode(AbstractKernelSelectionNode):
     def __init__(self, ast):
         self.ast = ast
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index b872be1d0c80e3537971b49434d64033373a1822..f0d8c53b6fae443ee6e1244161b7e37f77e75179 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -111,7 +111,6 @@ class NonUniformGPUScheme
    std::vector< std::vector< mpi::GenericBufferSystem< CpuBuffer_T, CpuBuffer_T > > > bufferSystemCPU_;
    std::vector< std::vector< mpi::GenericBufferSystem< GpuBuffer_T, GpuBuffer_T > > > bufferSystemGPU_;
-   std::vector< std::vector< GpuBuffer_T > > localBuffer_;
    GpuBuffer_T adaptiveGPUBuffer;
 
    std::vector< shared_ptr< GeneratedNonUniformGPUPackInfo > > packInfos_;
@@ -174,7 +173,6 @@ void NonUniformGPUScheme< Stencil >::init()
 {
    bufferSystemCPU_.resize(3);
    bufferSystemGPU_.resize(3);
-   localBuffer_.resize(3);
    headers_.resize(3);
    communicationInProgress_.resize(3);
@@ -201,7 +199,6 @@ void NonUniformGPUScheme< Stencil >::refresh()
   {
      bufferSystemCPU_[i].clear();
      bufferSystemGPU_[i].clear();
-     localBuffer_[i].clear();
 
      headers_[i].clear();
      headers_[i].resize(size_t(levels + uint_t(1)));
@@ -210,7 +207,6 @@ void NonUniformGPUScheme< Stencil >::refresh()
      {
         headers_[i][j].clear();
         bufferSystemCPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
         bufferSystemGPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
-        localBuffer_[i].emplace_back();
      }
 
      communicationInProgress_[i].resize(size_t(levels + uint_t(1)), false);
@@ -370,10 +366,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
       }
    }
    // wait for packing to finish
-   for (uint_t i = 0; i < Stencil::Q; ++i)
-   {
-      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-   }
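+   // synchronize the whole device once instead of the individual packing streams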
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
    if (sendFromGPU_)
@@ -433,13 +426,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
            if( coarseBlock->neighborExistsLocally( neighborIdx, n ) )
            {
               auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) );
-              GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index];
-
               for (auto& pi : packInfos_)
               {
-                 WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                 WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
-                 pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr);
+                 pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, nullptr);
              }
           }
           else
@@ -469,15 +458,11 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
            }
        }
     }
-     localBuffer_[COARSE_TO_FINE][index].clear();
   }
 
   // wait for packing to finish
-  for (uint_t i = 0; i < Stencil::Q; ++i)
-  {
-     WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-  }
+  WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
   if (sendFromGPU_) bufferSystemGPU_[COARSE_TO_FINE][index].sendAll();
@@ -536,13 +521,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
           if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) )
           {
              auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) );
-             GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index];
-
             for (auto& pi : packInfos_)
             {
-                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
-                pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr);
+                pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, nullptr);
             }
          }
         else
@@ -571,13 +552,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
           }
       }
    }
-    localBuffer_[FINE_TO_COARSE][index].clear();
  }
 
  // wait for packing to finish
-  for (uint_t i = 0; i < Stencil::Q; ++i)
-  {
-     WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-  }
+  WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
  if (sendFromGPU_) bufferSystemGPU_[FINE_TO_COARSE][index].sendAll();
@@ -644,10 +621,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve
        }
     }
  }
-  for (uint_t i = 0; i < Stencil::Q; ++i)
-  {
-     WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-  }
+  WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
  communicationInProgress_[EQUAL_LEVEL][level] = false;
 }
@@ -673,7 +647,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
           GpuBuffer_T &gpuDataBuffer = recvInfo.buffer();
           WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
           pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir],
-                                     gpuDataBuffer, streams_[0]);
+                                     gpuDataBuffer, nullptr);
        }
     }
  }
@@ -697,17 +671,14 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
              WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
              WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)
 
-             WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[0]))
-             pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[0]);
+             WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, nullptr))
+             pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, nullptr);
          }
       }
    }
 }
-  for (uint_t i = 0; i < Stencil::Q; ++i)
-  {
-     WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-  }
+  WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
  communicationInProgress_[COARSE_TO_FINE][fineLevel] = false;
 }
@@ -766,10 +737,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi
        }
     }
  }
-  for (uint_t i = 0; i < Stencil::Q; ++i)
-  {
-     WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
-  }
+  WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
 
  communicationInProgress_[FINE_TO_COARSE][fineLevel] = false;
 }
@@ -788,9 +756,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
  std::vector< std::vector< shared_ptr< mpi::BufferSystem > > > headerExchangeBs;
 
-  std::vector< std::vector< mpi::MPISize > > localBufferSize;
-
-  localBufferSize.resize(3);
  senderInfo.resize(3);
  receiverInfo.resize(3);
@@ -809,10 +774,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
     headerExchangeBs[EQUAL_LEVEL].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
     headerExchangeBs[COARSE_TO_FINE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
     headerExchangeBs[FINE_TO_COARSE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
-
-     localBufferSize[EQUAL_LEVEL].push_back(mpi::MPISize(0));
-     localBufferSize[COARSE_TO_FINE].push_back(mpi::MPISize(0));
-     localBufferSize[FINE_TO_COARSE].push_back(mpi::MPISize(0));
  }
 
  for (auto& iBlock : *forest)
@@ -868,8 +829,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
           if( block->neighborExistsLocally( neighborIdx, n ) )
           {
-              for (auto& pi : packInfos_)
-                 localBufferSize[COARSE_TO_FINE][fineLevel] += mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir));
             continue;
          }
@@ -896,8 +855,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
          if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) )
         {
-             for (auto& pi : packInfos_)
-                localBufferSize[FINE_TO_COARSE][level] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir));
           continue;
        }
@@ -939,8 +896,6 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
          bufferSystemCPU_[i][j].sendBuffer(it.first).resize(size_t(it.second));
          bufferSystemGPU_[i][j].sendBuffer(it.first).resize(size_t(it.second));
       }
-        if (localBufferSize[i][j] > 0)
-           localBuffer_[i][j].resize(size_t(localBufferSize[i][j]));
    }
  }
  forestModificationStamp_ = forest->getBlockForest().getModificationStamp();
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
index 96c514fcc4369084273c098ac9bf4ad21310ae29..c1595e4ed6332fee6e0b885c6e455d3e0ffb99d0 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
@@ -52,6 +52,12 @@ class NonuniformGPUPackingKernelsWrapper
    void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
                            CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0;
 
+   void localCopyRedistribute(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
+                              CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0;
+
+   void localPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval srcInterval,
+                                PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0;
+
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                            stencil::Direction dir, gpuStream_t stream ) const = 0;
@@ -108,6 +114,18 @@ class NonuniformGPUPackingKernelsWrapper< PdfField_T, false >
       kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, stream);
    }
 
+   void localCopyRedistribute(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
+                              CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      kernels_.localCopyRedistribute(srcField, srcInterval, dstField, dstInterval, dir, stream);
+   }
+
+   void localPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval srcInterval,
+                                PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      kernels_.localPartialCoalescence(srcField, maskField, srcInterval, dstField, dstInterval, dir, stream);
+   }
+
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                            stencil::Direction dir, gpuStream_t stream = nullptr) const
    {
@@ -192,10 +210,34 @@ class NonuniformGPUPackingKernelsWrapper< PdfField_T, true >
       kernels_.localCopyDirection(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream);
    }
 
+   void localCopyRedistribute(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
+                              CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      uint8_t timestep = srcField->getTimestep();
+      WALBERLA_ASSERT((dstField->getTimestep() & 1) ^ 1, "When the coarse to fine step is executed, the fine field must "
+                                                         "be on an odd timestep, while the coarse source field can be "
+                                                         "on either an even or an odd state.")
+      kernels_.localCopyRedistribute(srcField, srcInterval, dstField, dstInterval, dir, timestep, stream);
+   }
+
+   void localPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval srcInterval,
+                                PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const
+   {
+      uint8_t timestep = dstField->getTimestep();
+      WALBERLA_ASSERT(!(srcField->getTimestep() & 1) ^ 1, "When the fine to coarse step is executed, the fine source field must "
+                                                          "be on an even timestep, while the coarse destination field can be "
+                                                          "on either an even or an odd state.")
+      kernels_.localPartialCoalescence(srcField, maskField, srcInterval, dstField, dstInterval, dir, timestep, stream);
+   }
+
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
                            stencil::Direction dir, gpuStream_t stream = nullptr) const
    {
       uint8_t timestep = dstField->getTimestep();
+      WALBERLA_ASSERT((dstField->getTimestep() & 1) ^ 1, "When the coarse to fine step is executed, the fine field must "
+                                                         "be on an odd timestep, while the coarse source field can be "
+                                                         "on either an even or an odd state.")
       kernels_.unpackRedistribute(dstField, ci, inBuffer, dir, timestep, stream);
    }
@@ -203,6 +245,10 @@ class NonuniformGPUPackingKernelsWrapper< PdfField_T, true >
                                 unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const
    {
       uint8_t timestep = srcField->getTimestep();
+      WALBERLA_ASSERT(!(srcField->getTimestep() & 1) ^ 1, "When the fine to coarse step is executed, the fine source field must "
+                                                          "be on an even timestep, while the coarse destination field can be "
+                                                          "on either an even or an odd state.")
+
       kernels_.packPartialCoalescence(srcField, maskField, ci, outBuffer, dir, timestep, stream);
    }
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
index 987cebe9b2bfd343ed0277ed3faefef4dddaa753..3fe29511c3758a1543b02e5c4b28c971e939621a 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
@@ -168,21 +168,13 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFi
       Direction const unpackDir = dstIntervals[index].first;
       CellInterval dstInterval = dstIntervals[index].second;
 
-      uint_t packSize = kernels_.size(srcInterval);
-
 #ifndef NDEBUG
       Direction const packDir = srcIntervals[index].first;
       WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir])
       uint_t unpackSize = kernels_.redistributeSize(dstInterval);
-      WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
+      WALBERLA_ASSERT_EQUAL(kernels_.size(srcInterval), unpackSize)
 #endif
-
-      // TODO: This is a dirty workaround. Code-generate direct redistribution!
-      unsigned char *buffer;
-      WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
-      kernels_.packAll(srcField, srcInterval, buffer, stream);
-      kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir, stream);
-      WALBERLA_GPU_CHECK(gpuFree(buffer))
+      kernels_.localCopyRedistribute(srcField, srcInterval, dstField, dstInterval, unpackDir, stream);
    }
 }
 
@@ -190,6 +182,9 @@ template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine(
    const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream)
 {
+   // WARNING: This overload stages the data through a GPU buffer. Prefer the direct
+   // communicateLocalCoarseToFine overload (without buffer) whenever possible.
+
    auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_);
    auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_);
 
@@ -269,22 +264,16 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar
 
    CellInterval srcInterval;
    srcField->getGhostRegion(dir, srcInterval, 2);
-   uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir);
 
    CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(),
                                                                 invDir, dstField);
 
 #ifndef NDEBUG
    uint_t unpackSize = kernels_.size(dstInterval, invDir);
-   WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
+   WALBERLA_ASSERT_EQUAL(kernels_.partialCoalescenceSize(srcInterval, dir), unpackSize)
 #endif
 
-   // TODO: This is a dirty workaround. Code-generate direct redistribution!
-   unsigned char *buffer;
-   WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
-   kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir, stream);
-   kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir, stream);
-   WALBERLA_GPU_CHECK(gpuFree(buffer))
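+   // Fused code-generated kernel: reads the masked fine ghost region and accumulates the
+   // 1/2^d-weighted populations directly into the coarse field, without a staging buffer.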
+   kernels_.localPartialCoalescence(srcField, maskField, srcInterval, dstField, dstInterval, dir, stream);
 }