diff --git a/generate_packinfo.py b/generate_packinfo.py index 4d9bbd08128c691ee60ab988b67ad66202dca3fb..d3d30dd54d977f34e80c31331fce6b6d3cd6b32c 100644 --- a/generate_packinfo.py +++ b/generate_packinfo.py @@ -83,8 +83,8 @@ def generate_pack_info(class_name: str, **create_kernel_params) unpack_ast.function_name = 'unpack_{}'.format("_".join(inv_direction_string)) - pack_kernels[direction_strings] = KernelInfo(pack_ast, [], [], []) - unpack_kernels[inv_direction_string] = KernelInfo(unpack_ast, [], [], []) + pack_kernels[direction_strings] = KernelInfo(pack_ast) + unpack_kernels[inv_direction_string] = KernelInfo(unpack_ast) elements_per_cell[direction_strings] = len(terms) fused_kernel = create_kernel([Assignment(buffer.center, t) for t in all_accesses], **create_kernel_params) @@ -93,7 +93,7 @@ def generate_pack_info(class_name: str, 'class_name': class_name, 'pack_kernels': pack_kernels, 'unpack_kernels': unpack_kernels, - 'fused_kernel': KernelInfo(fused_kernel, [], [], []), + 'fused_kernel': KernelInfo(fused_kernel), 'elements_per_cell': elements_per_cell, 'target': target, 'dtype': dtype, diff --git a/jinja_filters.py b/jinja_filters.py index 8bc31cdf8c501043d25348a55e3ead38d9bf0985..ceb20d7cc41fb31c490e482cd3e713315981eb5c 100644 --- a/jinja_filters.py +++ b/jinja_filters.py @@ -1,12 +1,10 @@ import sympy as sp import jinja2 -import copy - from pystencils import TypedSymbol -from pystencils.astnodes import ResolvedFieldAccess from pystencils.data_types import get_base_type from pystencils.backends.cbackend import generate_c, CustomSympyPrinter -from pystencils.field import FieldType, Field +from pystencils.field import FieldType +from pystencils.kernelparameters import SHAPE_DTYPE from pystencils.sympyextensions import prod temporary_fieldMemberTemplate = """ @@ -34,54 +32,35 @@ def make_field_type(dtype, f_size, is_gpu): return "GhostLayerField<%s, %d>" % (dtype, f_size) -def get_field_fsize(field, field_accesses=()): - if field.has_fixed_index_shape and field.index_dimensions > 0: - return prod(field.index_shape) - elif field.index_dimensions == 0: +def get_field_fsize(field): + """Determines the size of the index coordinate. Since walberla fields only support one index dimension, + pystencils fields with multiple index dimensions are linearized to a single index dimension. + """ + assert field.has_fixed_index_shape, \ + "All Fields have to be created with fixed index coordinate shape using index_shape=(q,) " + str(field.name) + + if field.index_dimensions == 0: return 1 else: - assert len(field_accesses) > 0 - assert field.index_dimensions == 1 - max_idx_value = 0 - for acc in field_accesses: - if acc.field == field and acc.idx_coordinate_values[0] > max_idx_value: - max_idx_value = acc.idx_coordinate_values[0] - return max_idx_value + 1 + return prod(field.index_shape) -@jinja2.contextfilter -def generate_declaration(ctx, kernel_info): +def generate_declaration(kernel_info): """Generates the declaration of the kernel function""" - is_gpu = ctx['target'] == 'gpu' ast = kernel_info.ast - if is_gpu: - params_in_constant_mem = [p for p in ast.parameters if p.is_field_stride_argument or p.is_field_shape_argument] - ast.global_variables.update([p.name for p in params_in_constant_mem]) - result = generate_c(ast, signature_only=True) + ";" result = "namespace internal_%s {\n%s\n}" % (ast.function_name, result,) return result -@jinja2.contextfilter -def generate_definition(ctx, kernel_info): +def generate_definition(kernel_info): """Generates the definition (i.e. implementation) of the kernel function""" - is_gpu = ctx['target'] == 'gpu' ast = kernel_info.ast - if is_gpu: - params_in_constant_mem = [p for p in ast.parameters if p.is_field_stride_argument or p.is_field_shape_argument] - ast = copy.deepcopy(ast) - ast.global_variables.update([p.symbol for p in params_in_constant_mem]) - prefix = ["__constant__ %s %s[4];" % (get_base_type(p.dtype).base_name, p.name) for p in params_in_constant_mem] - prefix = "\n".join(prefix) - else: - prefix = "" - result = generate_c(ast) - result = "namespace internal_%s {\n%s\nstatic %s\n}" % (ast.function_name, prefix, result) + result = "namespace internal_%s {\nstatic %s\n}" % (ast.function_name, result) return result -def field_extraction_code(field_accesses, field_name, is_temporary, declaration_only=False, +def field_extraction_code(field, is_temporary, declaration_only=False, no_declaration=False, is_gpu=False): """Returns code string for getting a field pointer. @@ -89,18 +68,16 @@ def field_extraction_code(field_accesses, field_name, is_temporary, declaration_ created. Args: - field_accesses: set of Field.Access objects of a kernel - field_name: the field name for which the code should be created + field: the field for which the code should be created is_temporary: new_filtered field from block (False) or create a temporary copy of an existing field (True) declaration_only: only create declaration instead of the full code no_declaration: create the extraction code, and assume that declarations are elsewhere is_gpu: if the field is a GhostLayerField or a GpuField """ - fields = {fa.field.name: fa.field for fa in field_accesses} - field = fields[field_name] # Determine size of f coordinate which is a template parameter - f_size = get_field_fsize(field, field_accesses) + f_size = get_field_fsize(field) + field_name = field.name dtype = get_base_type(field.dtype) field_type = make_field_type(dtype, f_size, is_gpu) @@ -125,52 +102,44 @@ def field_extraction_code(field_accesses, field_name, is_temporary, declaration_ @jinja2.contextfilter -def generate_field_parameters(ctx, kernel_info, parameters_to_ignore=[]): - is_gpu = ctx['target'] == 'gpu' - ast = kernel_info.ast - fields = sorted(list(ast.fields_accessed), key=lambda f: f.name) - field_accesses = ast.atoms(ResolvedFieldAccess) - - return ", ".join(["%s * %s" % (make_field_type(get_base_type(f.dtype), - get_field_fsize(f, field_accesses), - is_gpu), f.name) - for f in fields if f.name not in parameters_to_ignore]) - -@jinja2.contextfilter -def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_ignore=[], parameters=None, +def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_ignore=(), parameters=None, declarations_only=False, no_declarations=False): - ast = kernel_info.ast - field_accesses = [a for a in ast.atoms(ResolvedFieldAccess) if a.field.name not in parameters_to_ignore] - + """Generates code that extracts all required fields of a kernel from a walberla block storage.""" if parameters is not None: - assert parameters_to_ignore == [] + assert parameters_to_ignore == () + field_parameters = [] + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name in parameters: + field_parameters.append(param.fields[0]) else: - parameters = {p.field_name for p in ast.parameters if p.is_field_ptr_argument} - parameters.difference_update(parameters_to_ignore) + field_parameters = [] + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_ignore: + field_parameters.append(param.fields[0]) - normal = {f for f in parameters if f not in kernel_info.temporary_fields} - temporary = {f for f in parameters if f in kernel_info.temporary_fields} + normal_fields = {f for f in field_parameters if f.name not in kernel_info.temporary_fields} + temporary_fields = {f for f in field_parameters if f.name in kernel_info.temporary_fields} args = { - 'field_accesses': field_accesses, 'declaration_only': declarations_only, 'no_declaration': no_declarations, 'is_gpu': ctx['target'] == 'gpu', } - result = "\n".join(field_extraction_code(field_name=fn, is_temporary=False, **args) for fn in normal) + "\n" - result += "\n".join(field_extraction_code(field_name=fn, is_temporary=True, **args) for fn in temporary) + result = "\n".join(field_extraction_code(field=field, is_temporary=False, **args) for field in normal_fields) + "\n" + result += "\n".join(field_extraction_code(field=field, is_temporary=True, **args) for field in temporary_fields) return result def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore): - symbols = {p.field_name for p in kernel_info.ast.parameters if p.is_field_ptr_argument} - symbols.update(p.name for p in kernel_info.ast.parameters if not p.is_field_argument) + symbols = {p.field_name for p in kernel_info.parameters if p.is_field_pointer} + symbols.update(p.symbol.name for p in kernel_info.parameters if not p.is_field_parameter) symbols.difference_update(parameters_to_ignore) return "\n".join("auto & %s = %s%s;" % (s, prefix, s) for s in symbols) @jinja2.contextfilter -def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=None, stream='0', spatial_shape_symbols=[]): +def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=None, stream='0', + spatial_shape_symbols=()): """Generates the function call to a pystencils kernel Args: @@ -184,10 +153,15 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non that defines the inner region for the kernel to loop over. Parameter has to be left to default if ghost_layers_to_include is specified. stream: optional name of cuda stream variable + spatial_shape_symbols: relevant only for gpu kernels - to determine CUDA block and grid sizes the iteration + region (i.e. field shape) has to be known. This can normally be inferred by the kernel + parameters - however in special cases like boundary conditions a manual specification + may be necessary. """ assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0 ast = kernel_info.ast - ast_params = kernel_info.ast.parameters + ast_params = kernel_info.parameters + is_cpu = ctx['target'] == 'cpu' ghost_layers_to_include = sp.sympify(ghost_layers_to_include) if ast.ghost_layers is None: @@ -196,18 +170,15 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non # ghost layer info is ((x_gl_front, x_gl_end), (y_gl_front, y_gl_end).. ) required_ghost_layers = max(max(ast.ghost_layers)) - is_cpu = ctx['target'] == 'cpu' - kernel_call_lines = [] - fields = {f.name: f for f in ast.fields_accessed} def get_start_coordinates(field_object): if cell_interval is None: return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions else: assert ghost_layers_to_include == 0 - return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord, ci=cell_interval)) - required_ghost_layers - for coord in ('x', 'y', 'z')] + return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord_name, ci=cell_interval)) - required_ghost_layers + for coord_name in ('x', 'y', 'z')] def get_end_coordinates(field_object): if cell_interval is None: @@ -216,43 +187,17 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non return ["%s->%s + %s" % (field_object.name, e, offset) for e in shape_names] else: assert ghost_layers_to_include == 0 - coord_names = ['x', 'y', 'z'][:field_object.spatial_dimensions] - return ["{ci}.{coord}Size() + {gl}".format(coord=coord, ci=cell_interval, gl=2 * required_ghost_layers) - for coord in coord_names] - - def create_field_shape_code(field, param_name, gpu_copy=True): - result = [] - type_str = get_base_type(Field.SHAPE_DTYPE).base_name - shapes = ["%s(%s)" % (type_str, c) for c in get_end_coordinates(field)] - - max_values = ["%s->%sSizeWithGhostLayer()" % (field.name, coord) for coord in ['x', 'y', 'z']] - for shape, max_value in zip(shapes, max_values): - result.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape)) - - if field.index_dimensions == 1: - shapes.append("%s(%s->fSize())" % (type_str, field.name)) - elif field.index_dimensions > 1: - shapes.extend(["%s(%d)" % (type_str, e) for e in field.index_shape]) - result.append("WALBERLA_ASSERT_EQUAL(int(%s->fSize()), %d);" % - (field.name, prod(field.index_shape))) - if is_cpu or not gpu_copy: - result.append("const %s %s [] = {%s};" % (type_str, param_name, ", ".join(shapes))) - else: - result.append("const %s %s_cpu [] = {%s};" % (type_str, param_name, ", ".join(shapes))) - result.append( - "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), " - "0, cudaMemcpyHostToDevice, %s) );" - % (ast.function_name, param_name, param_name, len(shapes), type_str, stream)) - return result + return ["{ci}.{coord}Size() + {gl}".format(coord=coord_name, ci=cell_interval, gl=2 * required_ghost_layers) + for coord_name in ('x', 'y', 'z')] for param in ast_params: - if param.is_field_argument and FieldType.is_indexed(fields[param.field_name]): + if param.is_field_parameter and FieldType.is_indexed(param.fields[0]): continue - if param.is_field_ptr_argument: - field = fields[param.field_name] + if param.is_field_pointer: + field = param.fields[0] if field.field_type == FieldType.BUFFER: - kernel_call_lines.append("%s %s = %s;" % (param.dtype, param.name, param.field_name)) + kernel_call_lines.append("%s %s = %s;" % (param.symbol.dtype, param.symbol.name, param.field_name)) else: coordinates = get_start_coordinates(field) actual_gls = "int_c(%s->nrOfGhostLayers())" % (param.field_name, ) @@ -261,60 +206,34 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non (c, actual_gls)) while len(coordinates) < 4: coordinates.append(0) + coordinates = tuple(coordinates) kernel_call_lines.append("%s %s = %s->dataAt(%s, %s, %s, %s);" % - ((param.dtype, param.name, param.field_name) + tuple(coordinates))) - - elif param.is_field_stride_argument: - type_str = get_base_type(param.dtype).base_name - stride_names = ['xStride()', 'yStride()', 'zStride()', 'fStride()'] - stride_names = ["%s(%s->%s)" % (type_str, param.field_name, e) for e in stride_names] - field = fields[param.field_name] - strides = stride_names[:field.spatial_dimensions] - if field.index_dimensions > 0: - additional_strides = [1] - for shape in reversed(field.index_shape[1:]): - additional_strides.append(additional_strides[-1] * shape) - assert len(additional_strides) == field.index_dimensions - f_stride_name = stride_names[-1] - strides.extend(["%s(%d * %s)" % (type_str, e, f_stride_name) for e in reversed(additional_strides)]) - if is_cpu: - kernel_call_lines.append("const %s %s [] = {%s};" % (type_str, param.name, ", ".join(strides))) - else: - kernel_call_lines.append("const %s %s_cpu [] = {%s};" % (type_str, param.name, ", ".join(strides))) - kernel_call_lines.append( - "WALBERLA_CUDA_CHECK( cudaMemcpyToSymbolAsync(internal_%s::%s, %s_cpu, %d * sizeof(%s), " - "0, cudaMemcpyHostToDevice, %s) );" - % (ast.function_name, param.name, param.name, len(strides), type_str, stream)) - - elif param.is_field_shape_argument: - kernel_call_lines += create_field_shape_code(fields[param.field_name], param.name) - + ((param.symbol.dtype, param.symbol.name, param.field_name) + coordinates)) + elif param.is_field_stride: + type_str = param.symbol.dtype.base_name + stride_names = ('xStride()', 'yStride()', 'zStride()', 'fStride()') + casted_stride = "%s(%s->%s)" % (type_str, param.field_name, stride_names[param.symbol.coordinate]) + kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, casted_stride)) + elif param.is_field_shape: + coord = param.symbol.coordinate + field = param.fields[0] + type_str = param.symbol.dtype.base_name + shape = "%s(%s)" % (type_str, get_end_coordinates(field)[coord]) + assert coord < 3 + max_value = "%s->%sSizeWithGhostLayer()" % (field.name, ('x', 'y', 'z')[coord]) + kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape)) + kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, shape)) + + call_parameters = ", ".join([p.symbol.name for p in ast_params]) if not is_cpu: if not spatial_shape_symbols: - spatial_shape_symbols = [] - for param in ast_params: - if param.is_field_shape_argument: - spatial_shape_symbols = [TypedSymbol("%s_cpu[%d]" % (param.name, i), get_base_type(Field.SHAPE_DTYPE)) - for i in range(field.spatial_dimensions)] + spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] + spatial_shape_symbols.sort(key=lambda e: e.coordinate) else: - spatial_shape_symbols = [TypedSymbol(e, get_base_type(Field.SHAPE_DTYPE)) for e in spatial_shape_symbols] - - if not spatial_shape_symbols: - for param in ast_params: - if not param.is_field_argument: - continue - field = fields[param.field_name] - if field.field_type == FieldType.GENERIC: - kernel_call_lines += create_field_shape_code(field, '_size', gpu_copy=False) - spatial_shape_symbols = [TypedSymbol("_size[%d]" % (i, ), get_base_type(Field.SHAPE_DTYPE)) - for i in range(field.spatial_dimensions)] - break + spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) - call_parameters = ", ".join([p.name for p in ast_params - if p.is_field_ptr_argument or not p.is_field_argument]) sp_printer_c = CustomSympyPrinter() - kernel_call_lines += [ "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']), "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']), @@ -322,8 +241,7 @@ def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=Non stream, call_parameters), ] else: - kernel_call_lines.append("internal_%s::%s(%s);" % - (ast.function_name, ast.function_name, ", ".join([p.name for p in ast_params]))) + kernel_call_lines.append("internal_%s::%s(%s);" % (ast.function_name, ast.function_name, call_parameters)) return "\n".join(kernel_call_lines) @@ -335,56 +253,58 @@ def generate_swaps(kernel_info): return swaps -def generate_constructor_initializer_list(kernel_info, parameters_to_ignore=[]): +def generate_constructor_initializer_list(kernel_info, parameters_to_ignore=None): + if parameters_to_ignore is None: + parameters_to_ignore = [] + ast = kernel_info.ast parameters_to_ignore += kernel_info.temporary_fields parameter_initializer_list = [] - for param in ast.parameters: - if param.is_field_ptr_argument and param.field_name not in parameters_to_ignore: + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_ignore: parameter_initializer_list.append("%sID(%sID_)" % (param.field_name, param.field_name)) - elif not param.is_field_argument and param.name not in parameters_to_ignore: - parameter_initializer_list.append("%s(%s_)" % (param.name, param.name)) + elif not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: + parameter_initializer_list.append("%s(%s_)" % (param.symbol.name, param.symbol.name)) return ", ".join(parameter_initializer_list) -def generate_constructor_parameters(kernel_info, parameters_to_ignore=[]): - ast = kernel_info.ast +def generate_constructor_parameters(kernel_info, parameters_to_ignore=None): + if parameters_to_ignore is None: + parameters_to_ignore = [] + varying_parameters = [] if hasattr(kernel_info, 'varying_parameters'): varying_parameters = kernel_info.varying_parameters - varying_parameter_names = [e[1] for e in varying_parameters] + varying_parameter_names = tuple(e[1] for e in varying_parameters) parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names parameter_list = [] - for param in ast.parameters: - if param.is_field_ptr_argument and param.field_name not in parameters_to_ignore: + for param in kernel_info.parameters: + if param.is_field_pointer and param.field_name not in parameters_to_ignore: parameter_list.append("BlockDataID %sID_" % (param.field_name, )) - elif not param.is_field_argument and param.name not in parameters_to_ignore: - parameter_list.append("%s %s_" % (param.dtype, param.name,)) + elif not param.is_field_parameter and param.symbol.name not in parameters_to_ignore: + parameter_list.append("%s %s_" % (param.symbol.dtype, param.symbol.name,)) varying_parameters = ["%s %s_" % e for e in varying_parameters] return ", ".join(parameter_list + varying_parameters) @jinja2.contextfilter -def generate_members(ctx, kernel_info, parameters_to_ignore=None, only_fields=False): - if parameters_to_ignore is None: - parameters_to_ignore = [] - +def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=False): ast = kernel_info.ast fields = {f.name: f for f in ast.fields_accessed} - params_to_skip = parameters_to_ignore + kernel_info.temporary_fields + params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields) is_gpu = ctx['target'] == 'gpu' result = [] - for param in ast.parameters: - if only_fields and not param.is_field_argument: + for param in kernel_info.parameters: + if only_fields and not param.is_field_parameter: continue - if param.is_field_ptr_argument and param.field_name not in params_to_skip: + if param.is_field_pointer and param.field_name not in params_to_skip: result.append("BlockDataID %sID;" % (param.field_name, )) - elif not param.is_field_argument and param.name not in params_to_skip: - result.append("%s %s;" % (param.dtype, param.name,)) + elif not param.is_field_parameter and param.symbol.name not in params_to_skip: + result.append("%s %s;" % (param.symbol.dtype, param.symbol.name,)) for field_name in kernel_info.temporary_fields: f = fields[field_name] @@ -409,4 +329,3 @@ def add_pystencils_filters_to_jinja_env(jinja_env): jinja_env.filters['generate_block_data_to_field_extraction'] = generate_block_data_to_field_extraction jinja_env.filters['generate_swaps'] = generate_swaps jinja_env.filters['generate_refs_for_kernel_parameters'] = generate_refs_for_kernel_parameters - jinja_env.filters['generate_field_parameters'] = generate_field_parameters diff --git a/sweep.py b/sweep.py index 85bd5b5d4b8bc14744b0130859ad359883a0e5b6..e594529689df6b57c5c192551f62bfbdbd9e88a9 100644 --- a/sweep.py +++ b/sweep.py @@ -1,5 +1,4 @@ import sympy as sp -from collections import namedtuple import functools from jinja2 import Environment, PackageLoader @@ -7,7 +6,14 @@ from pystencils import kernel as kernel_decorator, create_staggered_kernel from pystencils import Field, SymbolCreator, create_kernel from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env -KernelInfo = namedtuple("KernelInfo", ['ast', 'temporary_fields', 'field_swaps', 'varying_parameters']) + +class KernelInfo: + def __init__(self, ast, temporary_fields=(), field_swaps=(), varying_parameters=()): + self.ast = ast + self.temporary_fields = tuple(temporary_fields) + self.field_swaps = tuple(field_swaps) + self.varying_parameters = tuple(varying_parameters) + self.parameters = ast.get_parameters() # cache parameters here class Sweep: @@ -81,24 +87,18 @@ class Sweep: ast = create_kernel(eqs, target=target, **optimization) ast.function_name = name - outer_kernels = {} - for dir_str in ('T', 'B', 'N', 'S', 'E', 'W'): - outer_kernel = create_kernel(eqs, target=target, **outer_optimization) - outer_kernel.function_name = name + "_" + dir_str - outer_kernels[dir_str] = KernelInfo(outer_kernel, temporary_fields, field_swaps, varying_parameters) - env = Environment(loader=PackageLoader('pystencils_walberla')) add_pystencils_filters_to_jinja_env(env) - representative_field = {p.field_name for p in ast.parameters if p.is_field_argument}.pop() + main_kernel_info = KernelInfo(ast, temporary_fields, field_swaps, varying_parameters) + representative_field = {p.field_name for p in main_kernel_info.parameters if p.is_field_parameter}.pop() context = { - 'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters), + 'kernel': main_kernel_info, 'namespace': namespace, 'class_name': ast.function_name[0].upper() + ast.function_name[1:], 'target': target, 'field': representative_field, - 'outer_kernels': outer_kernels, } header = env.get_template("SweepInnerOuter.tmpl.h").render(**context) @@ -121,7 +121,6 @@ class Sweep: file_names = [name + ".h", name + ('.cpp' if target == 'cpu' else '.cu')] codegen.register(file_names, callback) - @staticmethod def _generate_header_and_source(function_returning_assignments, name, target, namespace, temporary_fields, field_swaps, optimization, staggered, diff --git a/templates/SweepInnerOuter.tmpl.cpp b/templates/SweepInnerOuter.tmpl.cpp index 73f725420a28eed876950acec1cae472a40279bc..00fa1d802b03e63e3af23a9f48cf4ee182ff7ba3 100644 --- a/templates/SweepInnerOuter.tmpl.cpp +++ b/templates/SweepInnerOuter.tmpl.cpp @@ -45,11 +45,6 @@ namespace {{namespace}} { {{kernel|generate_definition}} -{% for outer_kernel in outer_kernels.values() %} -{{outer_kernel|generate_definition}} -{% endfor %} - - void {{class_name}}::operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} ) { {{kernel|generate_block_data_to_field_extraction|indent(4)}} @@ -100,28 +95,22 @@ void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaS layers.push_back(ci); } - {%if target is equalto 'gpu'%} - auto s = parallelStreams_.parallelSection( stream ); - {% endif %} - - {% for dir, ci in [('W', "layers[5]"), - ('E', 'layers[4]'), - ('S', 'layers[3]'), - ('N', 'layers[2]'), - ('B', 'layers[1]'), - ('T', 'layers[0]') ] %} - {%if target is equalto 'gpu'%} { - {{outer_kernels[dir]|generate_call(stream='s.stream()', cell_interval=ci)|indent(8)}} - s.next(); + auto parallelSection_ = parallelStreams_.parallelSection( stream ); + for( auto & ci: layers ) + { + parallelSection_.run([&]( auto s ) { + {{kernel|generate_call(stream='s', cell_interval='ci')|indent(16)}} + }); + } } {% else %} + for( auto & ci: layers ) { - {{outer_kernels[dir]|generate_call(cell_interval=ci)|indent(8)}} + {{kernel|generate_call(cell_interval='ci')|indent(8)}} } {% endif %} - {% endfor %} {{kernel|generate_swaps|indent(4)}} }