diff --git a/hog/operator_generation/operators.py b/hog/operator_generation/operators.py index 11df36940a3ad78f28916e1e1ecce33b37db0522..3d9455bdf1f7a20e3a5dfc36fada84bffc76dfa2 100644 --- a/hog/operator_generation/operators.py +++ b/hog/operator_generation/operators.py @@ -14,13 +14,13 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -from dataclasses import dataclass, field +from dataclasses import dataclass from enum import auto, Enum from functools import reduce import logging import os from textwrap import indent -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import cast, Dict, List, Optional, Set, Union import numpy as np import sympy as sp @@ -29,12 +29,12 @@ import pystencils as ps from sympy.codegen.ast import Assignment -from pystencils import AssignmentCollection, Kernel, Target, TypedSymbol -from pystencils.backend.ast import PsAstNode +from pystencils import AssignmentCollection, Kernel, Target from pystencils.backend.ast.analysis import OperationCounter, UndefinedSymbolsCollector -from pystencils.backend.ast.expressions import PsConstant, PsExpression, PsSymbolExpr +from pystencils.backend.ast.expressions import PsExpression from pystencils.backend.ast.iteration import dfs_preorder from pystencils.backend.ast.structural import PsBlock, PsLoop +from pystencils.backend.constants import PsConstant from pystencils.backend.kernelcreation import ( FreezeExpressions, KernelCreationContext, @@ -71,10 +71,8 @@ from hog.operator_generation.function_space_implementations.function_space_impl_ from hog.operator_generation.function_space_implementations.function_space_impl_factory import ( create_impl, ) -from hog.operator_generation.pystencils_extensions import create_generic_fields from hog.integrand import Form -from hog.ast import Operations from hog.blending import GeometryMap import hog.code_generation import hog.cse @@ -914,152 +912,6 @@ class HyTeGElementwiseOperator: return operator_cpp_file - def _compute_micro_element_coordinates( - self, - integration_info: IntegrationInfo, - element_index: List[sp.Symbol], - geometry: ElementGeometry, - ): # -> Tuple[List[int | sp.Symbol | Field.Access], List[CustomCodeNode]]: - """ - Computes coordinates of the micro-element. This is _not_ to be confused with the coordinates of the element's - vertices! - - We need to distinguish two purposes of the loop counters / element indices - they are used - (a) inside array accesses (i.e., for the computation of array indices - as integers) - (b) for the computation of coordinates (i.e., inside expressions - after being cast to a float type) - - Although we already have the (integer) symbols of the element as an input parameter, this method returns symbols - of the element indices that can be used for _computations_ - as opposed to array accesses. That is, we sort out - case (b). - - In short: - - use element_index for array accesses - - use what is returned from this method for computations that involve the element index (or loop counters - respectively, it is the same thing more or less) - - Why? Optimizations are not straightforward otherwise (includes purely affine transformations and especially - vectorization whenever there are loop-counter dependent expressions). - - :param integration_info: container that specifies various properties of the kernel - :param element_index: index of the micro-element, this is practically a list of symbols that should be equal to - the loop counter symbols - :param geometry: the element's geometry - :returns: a tuple of the element's vertex coordinates to be used for computations, and a list of CustomCodeNodes - that needs to be included at the beginning of the innermost loop's body - """ - - # This list is only filled if we want to vectorize. - loop_counter_custom_code_nodes = [] - - if ( - Opts.VECTORIZE not in integration_info.optimizations - and Opts.VECTORIZE512 not in integration_info.optimizations - ): - # The Jacobians are loop-counter dependent, and we do not care about vectorization. - # So we just use the indices. pystencils will handle casting them to float. - el_matrix_element_index = element_index.copy() - - else: - # Vectorization and loop-counter dependencies - # - # Easier said than done. - # - # At the time of writing this comment, pystencils` vectorizer has a problem with integers - # (or more general: with vectorizing loop counter-dependent expressions that are not array accesses). - # - # It follows a hack to circumvent this problem that can hopefully be removed as soon as pytencils` - # vectorizer is supporting such endeavors. - # Instead of directly vectorizing integer expressions, we use a CustomCodeNode to first cast the loop - # counters to the desired floating point format, and then write the cast loop counters to an array. - # The size of that array must be at least as large as the width of the respective vector data types - # (i.e. for AVX256 and 64bit counters it must be of length 4). - # Since the counters are now in an array (or more specifically a pystencils.Field data structure) - # the vectorizer just treats the (float) counters as any other data and can apply vectorization. - # - # For this to work, the array access (to the array that contains the cast loop counters) must contain - # the loop counter (otherwise it may be interpreted as a constant and moved in front of the loop). - # However, we only want the array to be of size <width-of-vector-datatype>. If we use the loop - # counter to access the array, at first glance one would think it needs to be as large as the - # iteration space. To avoid that, this hack introduces a "phantom" counter that is just set to the - # same value as the loop counter, and we subtract it from the access. We define that thing inside a - # CustomCodeNode, so it cannot be evaluated by sympy. - # - # Later we add the pystencils.Fields that we introduce here to the "global variables" of the - # Kernel object such that they do not end up in the kernel parameters. - # - # Whenever the loop counters are not part of expressions at all (e.g., when the Jacobians are - # constant on the entire macro) or when we do not vectorize, none of this is necessary. - # - # Phew. - - # Those are the fields that hold the loop counters. - # We need one for each loop (i.e., one for each dimension). - float_loop_ctr_arrays = create_generic_fields( - [ - s.name - for s in self.symbolizer.float_loop_ctr_array(geometry.dimensions) - ], - self._type_descriptor.pystencils_type, - ) - - # This is just some symbol that we set to the innermost loop ctr. - # However, we make sure sympy never sees this. - phantom_ctr = TypedSymbol("phantom_ctr_0", int) - - # Those are the actual counters (i.e. the coordinates of the micro-element) for use in expressions that - # are _not_ array accesses. We assign them to the first entry of the float loop counter array. The - # vectorizer can automatically assign multiple variables here. - # Note that the first, i.e., innermost loop counter is chosen for the array access for all dimensions! - el_matrix_element_index = [ - float_loop_ctr_arrays[d].absolute_access( - (element_index[0] - phantom_ctr,), (0,) - ) - for d in range(geometry.dimensions) - ] - - # Let's fill the array. - float_ctr_array_size = ( - 8 if Opts.VECTORIZE512 in integration_info.optimizations else 4 - ) - - custom_code = "" - custom_code += f"const int64_t phantom_ctr_0 = ctr_0;\n" - for d in range(geometry.dimensions): - array_name = ps.DEFAULTS.field_pointer_name( - self.symbolizer.float_loop_ctr_array(geometry.dimensions)[d].name - ) - custom_code += f"{self._type_descriptor.pystencils_type.c_string()} {array_name}[{float_ctr_array_size}];\n" - for i in range(float_ctr_array_size): - custom_code += f"{array_name}[{i}] = ({self._type_descriptor.pystencils_type.c_string()}) ctr_{d}" - if d == 0: - # We only vectorize the innermost loop. - # Only that counter is increased. The others are constant. - custom_code += f"+ {i}" - custom_code += ";\n" - - # We need the fields' symbols to signal that they have been defined inside the CustomCodeNode and do not - # end up in the kernel parameters. - float_loop_ctr_array_symbols = [ - FieldPointerSymbol(f.name, f.dtype, False) - for f in float_loop_ctr_arrays - ] - - # Creating our custom node. - loop_counter_custom_code_nodes = [ - CustomCodeNode( - custom_code, - # The (integer) loop counters. This makes sure the custom code node has to stay inside the loop. - # Without this, it would not depend on the counter. - element_index, - # We use the symbols_defined parameter to signal pystencils that the phantom_ctr and the loop - # counter arrays are being defined inside the custom code node and do not appear in the kernel - # parameters. - [phantom_ctr] + float_loop_ctr_array_symbols, - ) - ] - - return el_matrix_element_index, loop_counter_custom_code_nodes - def _generate_kernel( self, ctx: KernelCreationContext, @@ -1336,23 +1188,9 @@ class HyTeGElementwiseOperator: ) ) - # Compute coordinates of the micro-element that can safely be used for all optimizations. - # - # Those should not be used for the computation of the Jacobians from the reference to the affine space since - # we can also exploit translation invariance there. Here, we cannot exploit that since the following symbols - # are meant for other computations that have to be executed at quadrature points. - # - # See docstring of the called method for details. - ( - el_matrix_element_index, - loop_counter_custom_code_nodes, - ) = self._compute_micro_element_coordinates( - integration_info, element_index, geometry - ) - el_vertex_coordinates = element_vertex_coordinates( geometry, - el_matrix_element_index, + element_index, # type: ignore[arg-type] element_type, indexing_info.micro_edges_per_macro_edge_float, macro_vertex_coordinates, @@ -1426,7 +1264,7 @@ class HyTeGElementwiseOperator: freeze = FreezeExpressions(ctx) loop_bodies[element_type] = freeze( AssignmentCollection( - loop_counter_custom_code_nodes + coords_assignments + blending_assignments + load_vecs + coords_assignments + blending_assignments + load_vecs ) ) loop_bodies[element_type].statements += [ @@ -1499,16 +1337,19 @@ class HyTeGElementwiseOperator: "boundary integrals: setting unused reference coordinate to 0", logging.DEBUG, ): - block = substitute_symbols( - block, - { - ctx.find_symbol( - self.symbolizer.ref_coords_as_list(geometry.dimensions)[ - -1 - ].name - ): PsExpression.make(PsConstant(0, ctx.default_dtype)) - }, - ) + if ( + unused_ref_coord_symb := ctx.find_symbol( + self.symbolizer.ref_coords_as_list(geometry.dimensions)[-1].name + ) + ) is not None: + block = substitute_symbols( + block, + { + unused_ref_coord_symb: PsExpression.make( + PsConstant(0, ctx.default_dtype) + ) + }, + ) # Add quadrature points and weights array declarations, but only those # which are actually needed. @@ -1582,7 +1423,7 @@ class HyTeGElementwiseOperator: # TODO: remove unused symbols automatically with TimedLogger("Canonicalizing symbols", logging.DEBUG): canonicalize = CanonicalizeSymbols(ctx, True) - kernel = canonicalize(kernel) + kernel = cast(PsBlock, canonicalize(kernel)) # optimizer applies optimizations with TimedLogger( @@ -1595,37 +1436,58 @@ class HyTeGElementwiseOperator: if isinstance(kernel_wrapper_type.kernel_type, Assemble): optimizer = optimizer.copy_without_vectorization() - platform_dep_kernels.append( - optimizer.apply_to_kernel( - ctx, kernel, dim, integration_info.loop_strategy - ) + optimized_kernel_asts = optimizer.apply_to_kernel( + ctx, kernel, dim, integration_info.loop_strategy ) - first_x_loop = next( - dfs_preorder( - kernel, - lambda node: isinstance(node, PsLoop) - and node.counter.symbol.name - == DEFAULTS.spatial_counter_names[0], - ) + # Count operations in non-vectorized kernel, because flops + # have been converted to function calls (intrinsics) by + # the vectorizer. + scalar_ast = optimized_kernel_asts[0].ast + first_x_loop = cast( + PsLoop, + next( + dfs_preorder( + scalar_ast, + lambda node: isinstance(node, PsLoop) + and ctx.basename(node.counter.symbol) + == DEFAULTS.spatial_counter_names[0], + ) + ), ) op_counts = OperationCounter()(first_x_loop.body) d = vars(op_counts) - kernel_op_count = tabulate.tabulate([d.values()], headers=d.keys()) - - kernel_function = ps.codegen.driver.create_cpu_kernel_function( - ctx, - ps.backend.platforms.GenericCpu(ctx), - kernel, - function_name=kernel_name, - target_spec=Target.X86_AVX, - jit=ps.jit.no_jit, + kernel_op_count = tabulate.tabulate( + [d.values()], headers=list(d.keys()) ) - kernel_functions.append(kernel_function) + k_funcs = [ + ps.codegen.driver.create_cpu_kernel_function( + ctx, + opt_kernel.platform, + opt_kernel.ast, + function_name=kernel_name, + target_spec=opt_kernel.target, + jit=ps.jit.no_jit, + ) + for opt_kernel in optimized_kernel_asts + ] + + kernel_functions.append(k_funcs[-1]) kernel_op_counts.append(kernel_op_count) + platform_dep_kernels.append( + dict( + zip( + ( + opt_kernel.desc + for opt_kernel in optimized_kernel_asts + ), + k_funcs, + ) + ) + ) - # Setup kernel wrapper string and op count table + # Setup kernel wrapper string # # In the following, we insert the sub strings of the final kernel string: # coefficients (retrieving pointers), setup of scalar parameters, kernel function call @@ -1754,11 +1616,11 @@ class HyTeGElementwiseOperator: ) # Collect all information - kernel = OperatorMethod( + op_method = OperatorMethod( kernel_wrapper_type, kernel_functions, platform_dep_kernels, kernel_op_counts, integration_infos, ) - self.operator_methods.append(kernel) + self.operator_methods.append(op_method) diff --git a/hog/operator_generation/optimizer.py b/hog/operator_generation/optimizer.py index d209c49e71f3e9ae24a49fa543685605924e8378..64ae52f496adcecaff24818f6f789a2397fa23b2 100644 --- a/hog/operator_generation/optimizer.py +++ b/hog/operator_generation/optimizer.py @@ -15,24 +15,35 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. from copy import deepcopy +from dataclasses import dataclass import enum import logging +import re import sympy as sp -from typing import Dict, Iterable, List, Set, Any +from typing import cast, Dict, Iterable, List, Set, Any from pystencils import Kernel, TypedSymbol -from pystencils.backend.ast.expressions import PsCall, PsConstant, PsExpression +from pystencils.backend.ast.expressions import PsCall, PsExpression from pystencils.backend.ast.structural import PsAssignment, PsBlock, PsLoop +from pystencils.backend.constants import PsConstant from pystencils.backend.functions import MathFunctions, PsMathFunction from pystencils.backend.kernelcreation import KernelCreationContext +import pystencils.backend.platforms as platforms from pystencils.backend.transformations import ( CanonicalizeSymbols, EliminateBranches, EliminateConstants, HoistLoopInvariantDeclarations, + LoopVectorizer, LowerToC, ReshapeLoops, + SelectIntrinsics, ) +from pystencils.codegen.config import VectorizationConfig +from pystencils.codegen.target import Target +from pystencils.defaults import DEFAULTS +from pystencils.types import PsScalarType +from pystencils.types.quick import Fp from hog.ast import Operations, count_operations import hog.cse @@ -96,6 +107,14 @@ def ordered_opts_suffix(loop_strategy: LoopStrategy, opts: Iterable[Opts]) -> st return opts_suffix +@dataclass +class OptimizedAst: + ast: PsBlock + desc: str + target: Target + platform: platforms.Platform + + class Optimizer: def __init__(self, opts: Set[Opts]): self._opts = set(opts) @@ -136,26 +155,21 @@ class Optimizer: def apply_to_kernel( self, ctx: KernelCreationContext, - kernel: PsBlock, + kernel_ast: PsBlock, dim: int, loop_strategy: LoopStrategy, - ) -> Dict[str, Kernel]: + ) -> List[OptimizedAst]: """Applies optimizations to the given kernel function. - The optimizations are applied to the passed kernel which is modified - in place. Additionally, optimizations which require hardware support - (i.e. vectorization) are applied in isolation. This method returns a - dictionary mapping platform features to kernels. The kernels in this - dictionary only require the hardware feature specified by the dictionary - key. - For example, when vectorization (avx) is requested the returned dict looks like - { "noarch": (optimized kernel *without vectorization*) - , "avx" : (vectorized kernel) - }. - The "avx" kernel is the same Python object as the argument to this - function. + This method returns a list of optimized kernel implementations ordered + increasingly by hardware requirements. In particular, if vectorization + is requested, the returned list contains two elements: a scalar kernel + at index 0, and a vectorized kernel at index 1. The returned object + contains the AST as well as information about the required hardware. """ + # 1. platform independent optimizations + if isinstance(loop_strategy, CUBES): # We cut the innermost loop so that the conditionals can be # simplified/removed. In particular, we split the innermost loop @@ -166,7 +180,9 @@ class Optimizer: with TimedLogger("cutting loops", logging.DEBUG): reshape_loops = ReshapeLoops(ctx) - loops = [loop for loop in kernel.statements if isinstance(loop, PsLoop)] + loops = [ + loop for loop in kernel_ast.statements if isinstance(loop, PsLoop) + ] assert len(loops) == 1, f"Expecting a single loop here, not {loops}" loop = loops[0] assert ( @@ -190,121 +206,159 @@ class Optimizer: with TimedLogger("simplifying conditionals", logging.DEBUG): loop = EliminateBranches(ctx)(loop) + # TODO: Fold and extract constants + # elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True) + # kernel_ast = cast(PsBlock, elim_constants(kernel_ast)) + if self[Opts.MOVECONSTANTS]: with TimedLogger("moving constants out of loop", logging.DEBUG): hoist_invariants = HoistLoopInvariantDeclarations(ctx) - kernel.statements = hoist_invariants(kernel).statements + kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast)) - with TimedLogger("lowering", logging.DEBUG): - # Lowering - lower_to_c = LowerToC(ctx) - kernel = lower_to_c(kernel) + # 2. hardware specific optimizations - # Late canonicalization and constant elimination passes - # * Since lowering introduces new index calculations and indexing symbols into the AST, - # * these need to be handled here + noarch_kernel = OptimizedAst( + kernel_ast, "noarch", Target.GenericCPU, platforms.GenericCpu(ctx) + ) + optimized_kernels = [noarch_kernel] - canonicalize = CanonicalizeSymbols(ctx, True) - kernel = canonicalize(kernel) + if self[Opts.VECTORIZE]: + with TimedLogger("vectorizing loops", logging.DEBUG): + kernel_ast = kernel_ast.clone() - # TODO expose constant folding as another optimization pass/option - # late_fold_constants = EliminateConstants(ctx, extract_constant_exprs=False) - # kernel = late_fold_constants(kernel) + if self[Opts.VECTORIZE512]: + if ctx.default_dtype == Fp(16): + arch = platforms.X86VectorArch.AVX512_FP16 + target = Target.X86_AVX512_FP16 + else: + arch = platforms.X86VectorArch.AVX512 + target = Target.X86_AVX512 + else: + arch = platforms.X86VectorArch.AVX + target = Target.X86_AVX - # self.elimtmps_and_reorder(kernel_function) + desc = "avx512" if self[Opts.VECTORIZE512] else "avx" + platform = platforms.X86VectorCpu(ctx, arch) - platform_dependent_kernels = {"noarch": deepcopy(kernel)} + lanes = VectorizationConfig.default_lanes( + target, cast(PsScalarType, ctx.default_dtype) + ) + vectorizer = LoopVectorizer(ctx, lanes) - if self[Opts.VECTORIZE]: - instruction_set = "avx512" if self[Opts.VECTORIZE512] else "avx" + def loop_predicate(loop: PsLoop) -> bool: + return ( + ctx.basename(loop.counter.symbol) + == DEFAULTS.spatial_counter_names[0] + ) - with TimedLogger("vectorizing loops", logging.DEBUG): - vectorize( - kernel_function, - instruction_set=instruction_set, - replace_loops_with_length_one=False, + kernel_ast = vectorizer.vectorize_select_loops( + kernel_ast, loop_predicate ) - platform_dependent_kernels[instruction_set] = kernel_function - - return platform_dependent_kernels - - def reduce_reuse_dist(self, stmts: List[PsAssignment]) -> List[PsAssignment]: - # collect the accessed symbols of each statement (accessed on their rhs) in a dict - accessed_symbols = {} - for stmt in stmts: - accessed_symbols[stmt.lhs.name] = [ - s.name for s in stmt.rhs.atoms(TypedSymbol, sp.Symbol) - ] - - ctr = 0 - # we want to find a position close to the point where all its accessed symbols/dependencies are defined - while ctr < len(stmts) - 1: - # take the current statement - stmt = stmts.pop(ctr) - ctr2 = 0 - # move upwards in the statement list - while ctr - ctr2 >= 0: - # print("ctr, ctr2=(" + str(ctr) + "," + str(ctr2) + ")") - cur_pos = ctr - ctr2 - stmt2 = stmts[cur_pos] - # if the current stmt2 we compare with defines a symbol that is accessed by stmt we stop and insert at that position - if stmt2.lhs.name in accessed_symbols[stmt.lhs.name]: - stmts.insert(cur_pos + 1, stmt) - break - # if we are at the beginning of the list stmt is not depending on any definition in the kernel (except for the loop counter) - # and can be put at its beginning - if ctr - ctr2 == 0: - stmts.insert(0, stmt) - ctr2 += 1 - - ctr += 1 - return stmts - - def elimtmps_and_reorder(self, kernel_function: Kernel) -> None: - main_body = kernel_function.body - - # eliminate temporaries with only a small number of flops - if self[Opts.ELIMTMPS]: - with TimedLogger( - "inlining tmps with small rhs (< 4 mul/add)", logging.DEBUG - ): - - def eliminate(stmt, stmts, ctr): - tmp_symbol = deepcopy(stmt.lhs) - for stmt_inner in stmts[ctr + 1 :]: - if ( - # TODO do not simply ignore other AST nodes - isinstance(stmt_inner, PsAssignment) - and stmt_inner.lhs.name != tmp_symbol.name - ): - stmt_inner.subs({tmp_symbol: stmt.rhs}) - - ctr = 0 - stmts_outer = main_body.take_child_nodes() - for stmt_outer in stmts_outer: - if isinstance(stmt_outer, PsAssignment): - ops = Operations() - count_operations(stmt_outer.rhs, ops) - flops = vars(ops) - if ( - flops["adds"] + flops["muls"] < 4 - and flops["divs"] == 0 - and flops["pows"] == 0 - and flops["abs"] == 0 - ): - eliminate(stmt_outer, stmts_outer, ctr) - else: - main_body.append(stmt_outer) - else: - # TODO recurse into blocks, loop bodies, etc. - main_body.append(stmt_outer) - ctr += 1 - - if self[Opts.REORDER]: - with TimedLogger( - "reordering stmts to reduce locality of access for tmps", logging.DEBUG - ): - # TODO recurse into blocks, loop bodies, etc. - main_body = Block(self.reduce_reuse_dist(main_body.args)) - - kernel_function.body = main_body + + select_intrin = SelectIntrinsics( + ctx, platform, use_builtin_convertvector=True + ) + kernel_ast = cast(PsBlock, select_intrin(kernel_ast)) + + optimized_kernels.append( + OptimizedAst(kernel_ast, desc, target, platform) + ) + + with TimedLogger("lowering", logging.DEBUG): + # Lowering + lower_to_c = LowerToC(ctx) + # Late canonicalization pass: Canonicalize new symbols introduced by LowerToC + canonicalize = CanonicalizeSymbols(ctx, True) + + for opt_kernel in optimized_kernels: + opt_kernel.ast = cast(PsBlock, lower_to_c(opt_kernel.ast)) + opt_kernel.ast = cast(PsBlock, canonicalize(opt_kernel.ast)) + + # TODO + # select_functions = SelectFunctions(self._platform) + # kernel_ast = cast(PsBlock, select_functions(kernel_ast)) + + # self.elimtmps_and_reorder(kernel_function) + + return optimized_kernels + + # def reduce_reuse_dist(self, stmts: List[PsAssignment]) -> List[PsAssignment]: + # # collect the accessed symbols of each statement (accessed on their rhs) in a dict + # accessed_symbols = {} + # for stmt in stmts: + # accessed_symbols[stmt.lhs.name] = [ + # s.name for s in stmt.rhs.atoms(TypedSymbol, sp.Symbol) + # ] + + # ctr = 0 + # # we want to find a position close to the point where all its accessed symbols/dependencies are defined + # while ctr < len(stmts) - 1: + # # take the current statement + # stmt = stmts.pop(ctr) + # ctr2 = 0 + # # move upwards in the statement list + # while ctr - ctr2 >= 0: + # # print("ctr, ctr2=(" + str(ctr) + "," + str(ctr2) + ")") + # cur_pos = ctr - ctr2 + # stmt2 = stmts[cur_pos] + # # if the current stmt2 we compare with defines a symbol that is accessed by stmt we stop and insert at that position + # if stmt2.lhs.name in accessed_symbols[stmt.lhs.name]: + # stmts.insert(cur_pos + 1, stmt) + # break + # # if we are at the beginning of the list stmt is not depending on any definition in the kernel (except for the loop counter) + # # and can be put at its beginning + # if ctr - ctr2 == 0: + # stmts.insert(0, stmt) + # ctr2 += 1 + + # ctr += 1 + # return stmts + + # def elimtmps_and_reorder(self, kernel_function: Kernel) -> None: + # main_body = kernel_function.body + + # # eliminate temporaries with only a small number of flops + # if self[Opts.ELIMTMPS]: + # with TimedLogger( + # "inlining tmps with small rhs (< 4 mul/add)", logging.DEBUG + # ): + + # def eliminate(stmt, stmts, ctr): + # tmp_symbol = deepcopy(stmt.lhs) + # for stmt_inner in stmts[ctr + 1 :]: + # if ( + # # TODO do not simply ignore other AST nodes + # isinstance(stmt_inner, PsAssignment) + # and stmt_inner.lhs.name != tmp_symbol.name + # ): + # stmt_inner.subs({tmp_symbol: stmt.rhs}) + + # ctr = 0 + # stmts_outer = main_body.take_child_nodes() + # for stmt_outer in stmts_outer: + # if isinstance(stmt_outer, PsAssignment): + # ops = Operations() + # count_operations(stmt_outer.rhs, ops) + # flops = vars(ops) + # if ( + # flops["adds"] + flops["muls"] < 4 + # and flops["divs"] == 0 + # and flops["pows"] == 0 + # and flops["abs"] == 0 + # ): + # eliminate(stmt_outer, stmts_outer, ctr) + # else: + # main_body.append(stmt_outer) + # else: + # # TODO recurse into blocks, loop bodies, etc. + # main_body.append(stmt_outer) + # ctr += 1 + + # if self[Opts.REORDER]: + # with TimedLogger( + # "reordering stmts to reduce locality of access for tmps", logging.DEBUG + # ): + # # TODO recurse into blocks, loop bodies, etc. + # main_body = Block(self.reduce_reuse_dist(main_body.args)) + + # kernel_function.body = main_body diff --git a/hog/symbolizer.py b/hog/symbolizer.py index 48d3ad607d1c2a0b92059d19561b97ce45e9c706..79a78b14ffb678252ac9b43040d20f9fc9260431 100644 --- a/hog/symbolizer.py +++ b/hog/symbolizer.py @@ -44,7 +44,6 @@ class Symbolizer: symbol_jac_affine_inv="jac_affine_inv", symbol_abs_det_jac_affine="abs_det_jac_affine", symbol_blending_parameter_prefix="blending_param_", - float_loop_ctr_array_prefix="float_loop_ctr_array_dim_", symbol_jac_blending="jac_blending", symbol_jac_blending_inv="jac_blending_inv", symbol_abs_det_jac_blending="abs_det_jac_blending", @@ -79,7 +78,6 @@ class Symbolizer: self._symbol_jac_affine_inv = symbol_jac_affine_inv self._symbol_abs_det_jac_affine = symbol_abs_det_jac_affine self._symbol_blending_parameter_prefix = symbol_blending_parameter_prefix - self._float_loop_ctr_array_prefix = float_loop_ctr_array_prefix self._symbol_jac_blending = symbol_jac_blending self._symbol_jac_blending_inv = symbol_jac_blending_inv self._symbol_abs_det_jac_blending = symbol_abs_det_jac_blending @@ -258,8 +256,3 @@ class Symbolizer: + list(self.abs_det_jac_affine_to_blending().free_symbols) + list(sp.Array(self.hessian_blending_map(dimensions)).free_symbols) ) - - def float_loop_ctr_array(self, dimensions: int) -> List[sp.Symbol]: - return sp.symbols( - [f"{self._float_loop_ctr_array_prefix}{d}" for d in range(dimensions)] - ) diff --git a/pyproject.toml b/pyproject.toml index db5b4aea92c7b4e7dff859d6825bb7aa4efec701..2ceea140ffe053a1ba2d46842078de42d1a8565a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ dependencies = [ "numpy==1.24.3", "quadpy-gpl==0.16.10", "poly-cse-py", - "pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@c5cf38b125a569928c16f7bb2f0551ca7f3086d5", + "pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@4f8e42e6c2866dce3f1244c41ecdf45db896ec61", "pytest==7.3.1", "sympy==1.11.1", "tabulate==0.9.0", diff --git a/requirements.txt b/requirements.txt index f7100bbfc871efe05c2f8d63663cff830c70b4f1..ef70266f101448e6fb27deee76a51cc96656e1d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ islpy ndim @ https://github.com/sigma-py/ndim/archive/refs/tags/v0.1.6.tar.gz numpy==1.24.3 poly-cse-py -pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@c5cf38b125a569928c16f7bb2f0551ca7f3086d5 +pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@4f8e42e6c2866dce3f1244c41ecdf45db896ec61 pytest==7.3.1 sympy==1.11.1 tabulate==0.9.0