diff --git a/codegen.py b/codegen.py index 8f8e248dd75ff8274ae18b9d91b8f55463d439b5..fb7fc6e3edd8b47d722825089fa7c282accbf710 100644 --- a/codegen.py +++ b/codegen.py @@ -4,6 +4,7 @@ from itertools import product from typing import Dict, Sequence, Tuple, Optional from pystencils import create_staggered_kernel, Field, create_kernel, Assignment, FieldType +from pystencils.backends.cbackend import get_headers from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets from pystencils.stencils import offset_to_direction_string, inverse_direction from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env @@ -44,6 +45,7 @@ def generate_sweep(generation_context, class_name, assignments, 'namespace': namespace, 'class_name': class_name, 'target': create_kernel_params.get("target", "cpu"), + 'headers': get_headers(ast), } header = env.get_template("Sweep.tmpl.h").render(**jinja_context) source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) @@ -58,6 +60,7 @@ def generate_sweep(generation_context, class_name, assignments, 'class_name': class_name, 'target': create_kernel_params.get("target", "cpu"), 'field': representative_field, + 'headers': get_headers(ast), } header = env.get_template("SweepInnerOuter.tmpl.h").render(**jinja_context) source = env.get_template("SweepInnerOuter.tmpl.cpp").render(**jinja_context) @@ -193,6 +196,7 @@ def default_create_kernel_parameters(generation_context, params): vec = params['cpu_vectorize_info'] vec['instruction_set'] = vec.get('instruction_set', default_vec_is) + vec['assume_inner_stride_one'] = True vec['assume_aligned'] = vec.get('assume_aligned', False) vec['nontemporal'] = vec.get('nontemporal', False) return params diff --git a/jinja_filters.py b/jinja_filters.py index b96ba4bd15ed06e864a89cfca64030ec0f8eea36..c6b1f459823a8eda175eabf1fdfc3c4d9affe4c6 100644 --- a/jinja_filters.py +++ b/jinja_filters.py @@ -8,7 +8,7 @@ from pystencils.kernelparameters import SHAPE_DTYPE from pystencils.sympyextensions import prod temporary_fieldMemberTemplate = """ -std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;""" +private: std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;""" temporary_fieldTemplate = """ // Getting temporary field {tmp_field_name} @@ -53,6 +53,23 @@ def get_field_fsize(field): else: return prod(field.index_shape) + +def get_field_stride(param): + field = param.fields[0] + type_str = get_base_type(param.symbol.dtype).base_name + stride_names = ['xStride()', 'yStride()', 'zStride()', 'fStride()'] + stride_names = ["%s(%s->%s)" % (type_str, param.field_name, e) for e in stride_names] + strides = stride_names[:field.spatial_dimensions] + if field.index_dimensions > 0: + additional_strides = [1] + for shape in reversed(field.index_shape[1:]): + additional_strides.append(additional_strides[-1] * shape) + assert len(additional_strides) == field.index_dimensions + f_stride_name = stride_names[-1] + strides.extend(["%s(%d * %s)" % (type_str, e, f_stride_name) for e in reversed(additional_strides)]) + return strides[param.symbol.coordinate] + + def generate_declaration(kernel_info): """Generates the declaration of the kernel function""" ast = kernel_info.ast @@ -222,9 +239,8 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non kernel_call_lines.append("%s %s = %s->dataAt(%s, %s, %s, %s);" % ((param.symbol.dtype, param.symbol.name, param.field_name) + coordinates)) elif param.is_field_stride: + casted_stride = get_field_stride(param) type_str = param.symbol.dtype.base_name - stride_names = ('xStride()', 'yStride()', 'zStride()', 'fStride()') - casted_stride = "%s(%s->%s)" % (type_str, param.field_name, stride_names[param.symbol.coordinate]) kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, casted_stride)) elif param.is_field_shape: coord = param.symbol.coordinate @@ -327,6 +343,9 @@ def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=Fals field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu) result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name)) + if hasattr(kernel_info, 'varying_parameters'): + result.extend(["%s %s;" % e for e in kernel_info.varying_parameters]) + return "\n".join(result) diff --git a/templates/Sweep.tmpl.cpp b/templates/Sweep.tmpl.cpp index 4dc0b53ace664bf94e380e0578ea38046a8a0cd9..63d686e4a041bb9e8201843dc7d7ad13dc077d5a 100644 --- a/templates/Sweep.tmpl.cpp +++ b/templates/Sweep.tmpl.cpp @@ -23,6 +23,9 @@ #include "core/DataTypes.h" #include "core/Macros.h" #include "{{class_name}}.h" +{% for header in headers %} +#include {{header}} +{% endfor %} {% if target is equalto 'cpu' -%} @@ -36,6 +39,7 @@ # pragma GCC diagnostic ignored "-Wfloat-equal" # pragma GCC diagnostic ignored "-Wshadow" # pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wunused-variable" #endif using namespace std; @@ -43,15 +47,37 @@ using namespace std; namespace walberla { namespace {{namespace}} { + {{kernel|generate_definition}} -void {{class_name}}::operator() ( IBlock * block ) + +void {{class_name}}::sweep( IBlock * block ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} {{kernel|generate_call(stream='stream_')|indent(4)}} {{kernel|generate_swaps|indent(4)}} } + +void {{class_name}}::sweepOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks, + const CellInterval & globalCellInterval, + cell_idx_t ghostLayers, + IBlock * block ) +{ + CellInterval ci = globalCellInterval; + CellInterval blockBB = blocks->getBlockCellBB( *block); + blockBB.expand( ghostLayers ); + ci.intersect( blockBB ); + blocks->transformGlobalToBlockLocalCellInterval( ci, *block ); + if( ci.empty() ) + return; + + {{kernel|generate_block_data_to_field_extraction|indent(4)}} + {{kernel|generate_call(stream='stream_', cell_interval='ci')|indent(4)}} + {{kernel|generate_swaps|indent(4)}} +} + + } // namespace {{namespace}} } // namespace walberla diff --git a/templates/Sweep.tmpl.h b/templates/Sweep.tmpl.h index af3879c12681ca35b16137913e0bdd0f9c3c116d..325b9c57c6dff002c69a73c92c6c0cf1dacf180d 100644 --- a/templates/Sweep.tmpl.h +++ b/templates/Sweep.tmpl.h @@ -17,6 +17,7 @@ //! \\author pystencils //====================================================================================================================== +#pragma once #include "core/DataTypes.h" {% if target is equalto 'cpu' -%} @@ -27,7 +28,7 @@ #include "field/SwapableCompare.h" #include "domain_decomposition/BlockDataID.h" #include "domain_decomposition/IBlock.h" - +#include "domain_decomposition/StructuredBlockStorage.h" #include <set> #ifdef __GNUC__ @@ -56,9 +57,28 @@ public: {{ kernel| generate_destructor(class_name) |indent(4) }} - void operator() ( IBlock * block ); + void operator()(IBlock * b) { sweep(b); } + + std::function<void (IBlock*)> getSweep() { + return [this](IBlock * b) { this->sweep(b); }; + } + + std::function<void (IBlock*)> getSweepOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks, + const CellInterval & globalCellInterval, + cell_idx_t ghostLayers=1 ) + { + return [this, blocks, globalCellInterval, ghostLayers] (IBlock * b) { + this->sweepOnCellInterval(blocks, globalCellInterval, ghostLayers, b); + }; + } + + {{ kernel|generate_members|indent(4) }} + private: - {{kernel|generate_members|indent(4)}} + void sweep( IBlock * block ); + void sweepOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks, + const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block ); + {%if target is equalto 'gpu'%} cudaStream_t stream_; {% endif %}