diff --git a/hog/operator_generation/operators.py b/hog/operator_generation/operators.py
index 11df36940a3ad78f28916e1e1ecce33b37db0522..3d9455bdf1f7a20e3a5dfc36fada84bffc76dfa2 100644
--- a/hog/operator_generation/operators.py
+++ b/hog/operator_generation/operators.py
@@ -14,13 +14,13 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from enum import auto, Enum
 from functools import reduce
 import logging
 import os
 from textwrap import indent
-from typing import Dict, List, Optional, Set, Tuple, Union
+from typing import cast, Dict, List, Optional, Set, Union
 
 import numpy as np
 import sympy as sp
@@ -29,12 +29,12 @@ import pystencils as ps
 
 from sympy.codegen.ast import Assignment
 
-from pystencils import AssignmentCollection, Kernel, Target, TypedSymbol
-from pystencils.backend.ast import PsAstNode
+from pystencils import AssignmentCollection, Kernel, Target
 from pystencils.backend.ast.analysis import OperationCounter, UndefinedSymbolsCollector
-from pystencils.backend.ast.expressions import PsConstant, PsExpression, PsSymbolExpr
+from pystencils.backend.ast.expressions import PsExpression
 from pystencils.backend.ast.iteration import dfs_preorder
 from pystencils.backend.ast.structural import PsBlock, PsLoop
+from pystencils.backend.constants import PsConstant
 from pystencils.backend.kernelcreation import (
     FreezeExpressions,
     KernelCreationContext,
@@ -71,10 +71,8 @@ from hog.operator_generation.function_space_implementations.function_space_impl_
 from hog.operator_generation.function_space_implementations.function_space_impl_factory import (
     create_impl,
 )
-from hog.operator_generation.pystencils_extensions import create_generic_fields
 
 from hog.integrand import Form
-from hog.ast import Operations
 from hog.blending import GeometryMap
 import hog.code_generation
 import hog.cse
@@ -914,152 +912,6 @@ class HyTeGElementwiseOperator:
 
         return operator_cpp_file
 
-    def _compute_micro_element_coordinates(
-        self,
-        integration_info: IntegrationInfo,
-        element_index: List[sp.Symbol],
-        geometry: ElementGeometry,
-    ):  # -> Tuple[List[int | sp.Symbol | Field.Access], List[CustomCodeNode]]:
-        """
-        Computes coordinates of the micro-element. This is _not_ to be confused with the coordinates of the element's
-        vertices!
-
-        We need to distinguish two purposes of the loop counters / element indices - they are used
-        (a) inside array accesses (i.e., for the computation of array indices - as integers)
-        (b) for the computation of coordinates (i.e., inside expressions - after being cast to a float type)
-
-        Although we already have the (integer) symbols of the element as an input parameter, this method returns symbols
-        of the element indices that can be used for _computations_ - as opposed to array accesses. That is, we sort out
-        case (b).
-
-        In short:
-        - use element_index for array accesses
-        - use what is returned from this method for computations that involve the element index (or loop counters
-          respectively, it is the same thing more or less)
-
-        Why? Optimizations are not straightforward otherwise (includes purely affine transformations and especially
-        vectorization whenever there are loop-counter dependent expressions).
-
-        :param integration_info: container that specifies various properties of the kernel
-        :param element_index: index of the micro-element, this is practically a list of symbols that should be equal to
-                              the loop counter symbols
-        :param geometry: the element's geometry
-        :returns: a tuple of the element's vertex coordinates to be used for computations, and a list of CustomCodeNodes
-                  that needs to be included at the beginning of the innermost loop's body
-        """
-
-        # This list is only filled if we want to vectorize.
-        loop_counter_custom_code_nodes = []
-
-        if (
-            Opts.VECTORIZE not in integration_info.optimizations
-            and Opts.VECTORIZE512 not in integration_info.optimizations
-        ):
-            # The Jacobians are loop-counter dependent, and we do not care about vectorization.
-            # So we just use the indices. pystencils will handle casting them to float.
-            el_matrix_element_index = element_index.copy()
-
-        else:
-            # Vectorization and loop-counter dependencies
-            #
-            # Easier said than done.
-            #
-            # At the time of writing this comment, pystencils` vectorizer has a problem with integers
-            # (or more general: with vectorizing loop counter-dependent expressions that are not array accesses).
-            #
-            # It follows a hack to circumvent this problem that can hopefully be removed as soon as pytencils`
-            # vectorizer is supporting such endeavors.
-            # Instead of directly vectorizing integer expressions, we use a CustomCodeNode to first cast the loop
-            # counters to the desired floating point format, and then write the cast loop counters to an array.
-            # The size of that array must be at least as large as the width of the respective vector data types
-            # (i.e. for AVX256 and 64bit counters it must be of length 4).
-            # Since the counters are now in an array (or more specifically a pystencils.Field data structure)
-            # the vectorizer just treats the (float) counters as any other data and can apply vectorization.
-            #
-            # For this to work, the array access (to the array that contains the cast loop counters) must contain
-            # the loop counter (otherwise it may be interpreted as a constant and moved in front of the loop).
-            # However, we only want the array to be of size <width-of-vector-datatype>. If we use the loop
-            # counter to access the array, at first glance one would think it needs to be as large as the
-            # iteration space. To avoid that, this hack introduces a "phantom" counter that is just set to the
-            # same value as the loop counter, and we subtract it from the access. We define that thing inside a
-            # CustomCodeNode, so it cannot be evaluated by sympy.
-            #
-            # Later we add the pystencils.Fields that we introduce here to the "global variables" of the
-            # Kernel object such that they do not end up in the kernel parameters.
-            #
-            # Whenever the loop counters are not part of expressions at all (e.g., when the Jacobians are
-            # constant on the entire macro) or when we do not vectorize, none of this is necessary.
-            #
-            # Phew.
-
-            # Those are the fields that hold the loop counters.
-            # We need one for each loop (i.e., one for each dimension).
-            float_loop_ctr_arrays = create_generic_fields(
-                [
-                    s.name
-                    for s in self.symbolizer.float_loop_ctr_array(geometry.dimensions)
-                ],
-                self._type_descriptor.pystencils_type,
-            )
-
-            # This is just some symbol that we set to the innermost loop ctr.
-            # However, we make sure sympy never sees this.
-            phantom_ctr = TypedSymbol("phantom_ctr_0", int)
-
-            # Those are the actual counters (i.e. the coordinates of the micro-element) for use in expressions that
-            # are _not_ array accesses. We assign them to the first entry of the float loop counter array. The
-            # vectorizer can automatically assign multiple variables here.
-            # Note that the first, i.e., innermost loop counter is chosen for the array access for all dimensions!
-            el_matrix_element_index = [
-                float_loop_ctr_arrays[d].absolute_access(
-                    (element_index[0] - phantom_ctr,), (0,)
-                )
-                for d in range(geometry.dimensions)
-            ]
-
-            # Let's fill the array.
-            float_ctr_array_size = (
-                8 if Opts.VECTORIZE512 in integration_info.optimizations else 4
-            )
-
-            custom_code = ""
-            custom_code += f"const int64_t phantom_ctr_0 = ctr_0;\n"
-            for d in range(geometry.dimensions):
-                array_name = ps.DEFAULTS.field_pointer_name(
-                    self.symbolizer.float_loop_ctr_array(geometry.dimensions)[d].name
-                )
-                custom_code += f"{self._type_descriptor.pystencils_type.c_string()} {array_name}[{float_ctr_array_size}];\n"
-                for i in range(float_ctr_array_size):
-                    custom_code += f"{array_name}[{i}] = ({self._type_descriptor.pystencils_type.c_string()}) ctr_{d}"
-                    if d == 0:
-                        # We only vectorize the innermost loop.
-                        # Only that counter is increased. The others are constant.
-                        custom_code += f"+ {i}"
-                    custom_code += ";\n"
-
-            # We need the fields' symbols to signal that they have been defined inside the CustomCodeNode and do not
-            # end up in the kernel parameters.
-            float_loop_ctr_array_symbols = [
-                FieldPointerSymbol(f.name, f.dtype, False)
-                for f in float_loop_ctr_arrays
-            ]
-
-            # Creating our custom node.
-            loop_counter_custom_code_nodes = [
-                CustomCodeNode(
-                    custom_code,
-                    # The (integer) loop counters. This makes sure the custom code node has to stay inside the loop.
-                    # Without this, it would not depend on the counter.
-                    element_index,
-                    # We use the symbols_defined parameter to signal pystencils that the phantom_ctr and the loop
-                    # counter arrays are being defined inside the custom code node and do not appear in the kernel
-                    # parameters.
-                    [phantom_ctr] + float_loop_ctr_array_symbols,
-                )
-            ]
-
-        return el_matrix_element_index, loop_counter_custom_code_nodes
-
     def _generate_kernel(
         self,
         ctx: KernelCreationContext,
@@ -1336,23 +1188,9 @@ class HyTeGElementwiseOperator:
                     )
                 )
 
-            # Compute coordinates of the micro-element that can safely be used for all optimizations.
-            #
-            # Those should not be used for the computation of the Jacobians from the reference to the affine space since
-            # we can also exploit translation invariance there. Here, we cannot exploit that since the following symbols
-            # are meant for other computations that have to be executed at quadrature points.
-            #
-            # See docstring of the called method for details.
-            (
-                el_matrix_element_index,
-                loop_counter_custom_code_nodes,
-            ) = self._compute_micro_element_coordinates(
-                integration_info, element_index, geometry
-            )
-
             el_vertex_coordinates = element_vertex_coordinates(
                 geometry,
-                el_matrix_element_index,
+                element_index,  # type: ignore[arg-type]
                 element_type,
                 indexing_info.micro_edges_per_macro_edge_float,
                 macro_vertex_coordinates,
@@ -1426,7 +1264,7 @@ class HyTeGElementwiseOperator:
             freeze = FreezeExpressions(ctx)
             loop_bodies[element_type] = freeze(
                 AssignmentCollection(
-                    loop_counter_custom_code_nodes + coords_assignments + blending_assignments + load_vecs
+                    coords_assignments + blending_assignments + load_vecs
                 )
             )
             loop_bodies[element_type].statements += [
@@ -1499,16 +1337,19 @@ class HyTeGElementwiseOperator:
                 "boundary integrals: setting unused reference coordinate to 0",
                 logging.DEBUG,
             ):
-                block = substitute_symbols(
-                    block,
-                    {
-                        ctx.find_symbol(
-                            self.symbolizer.ref_coords_as_list(geometry.dimensions)[
-                                -1
-                            ].name
-                        ): PsExpression.make(PsConstant(0, ctx.default_dtype))
-                    },
-                )
+                if (
+                    unused_ref_coord_symb := ctx.find_symbol(
+                        self.symbolizer.ref_coords_as_list(geometry.dimensions)[-1].name
+                    )
+                ) is not None:
+                    block = substitute_symbols(
+                        block,
+                        {
+                            unused_ref_coord_symb: PsExpression.make(
+                                PsConstant(0, ctx.default_dtype)
+                            )
+                        },
+                    )
 
         # Add quadrature points and weights array declarations, but only those
         # which are actually needed.
@@ -1582,7 +1423,7 @@ class HyTeGElementwiseOperator:
                     # TODO: remove unused symbols automatically
                     with TimedLogger("Canonicalizing symbols", logging.DEBUG):
                         canonicalize = CanonicalizeSymbols(ctx, True)
-                        kernel = canonicalize(kernel)
+                        kernel = cast(PsBlock, canonicalize(kernel))
 
                     # optimizer applies optimizations
                     with TimedLogger(
@@ -1595,37 +1436,58 @@ class HyTeGElementwiseOperator:
                         if isinstance(kernel_wrapper_type.kernel_type, Assemble):
                             optimizer = optimizer.copy_without_vectorization()
 
-                        platform_dep_kernels.append(
-                            optimizer.apply_to_kernel(
-                                ctx, kernel, dim, integration_info.loop_strategy
-                            )
+                        optimized_kernel_asts = optimizer.apply_to_kernel(
+                            ctx, kernel, dim, integration_info.loop_strategy
                         )
 
-                    first_x_loop = next(
-                        dfs_preorder(
-                            kernel,
-                            lambda node: isinstance(node, PsLoop)
-                            and node.counter.symbol.name
-                            == DEFAULTS.spatial_counter_names[0],
-                        )
+                    # Count operations in non-vectorized kernel, because flops
+                    # have been converted to function calls (intrinsics) by
+                    # the vectorizer.
+                    scalar_ast = optimized_kernel_asts[0].ast
+                    first_x_loop = cast(
+                        PsLoop,
+                        next(
+                            dfs_preorder(
+                                scalar_ast,
+                                lambda node: isinstance(node, PsLoop)
+                                and ctx.basename(node.counter.symbol)
+                                == DEFAULTS.spatial_counter_names[0],
+                            )
+                        ),
                     )
                     op_counts = OperationCounter()(first_x_loop.body)
                     d = vars(op_counts)
-                    kernel_op_count = tabulate.tabulate([d.values()], headers=d.keys())
-
-                    kernel_function = ps.codegen.driver.create_cpu_kernel_function(
-                        ctx,
-                        ps.backend.platforms.GenericCpu(ctx),
-                        kernel,
-                        function_name=kernel_name,
-                        target_spec=Target.X86_AVX,
-                        jit=ps.jit.no_jit,
+                    kernel_op_count = tabulate.tabulate(
+                        [d.values()], headers=list(d.keys())
                     )
 
-                    kernel_functions.append(kernel_function)
+                    k_funcs = [
+                        ps.codegen.driver.create_cpu_kernel_function(
+                            ctx,
+                            opt_kernel.platform,
+                            opt_kernel.ast,
+                            function_name=kernel_name,
+                            target_spec=opt_kernel.target,
+                            jit=ps.jit.no_jit,
+                        )
+                        for opt_kernel in optimized_kernel_asts
+                    ]
+
+                    kernel_functions.append(k_funcs[-1])
                     kernel_op_counts.append(kernel_op_count)
+                    platform_dep_kernels.append(
+                        dict(
+                            zip(
+                                (
+                                    opt_kernel.desc
+                                    for opt_kernel in optimized_kernel_asts
+                                ),
+                                k_funcs,
+                            )
+                        )
+                    )
 
-                # Setup kernel wrapper string and op count table
+                # Setup kernel wrapper string
                 #
                 # In the following, we insert the sub strings of the final kernel string:
                 # coefficients (retrieving pointers), setup of scalar parameters, kernel function call
@@ -1754,11 +1616,11 @@ class HyTeGElementwiseOperator:
                 )
 
                 # Collect all information
-                kernel = OperatorMethod(
+                op_method = OperatorMethod(
                     kernel_wrapper_type,
                     kernel_functions,
                     platform_dep_kernels,
                     kernel_op_counts,
                     integration_infos,
                 )
-                self.operator_methods.append(kernel)
+                self.operator_methods.append(op_method)
diff --git a/hog/operator_generation/optimizer.py b/hog/operator_generation/optimizer.py
index d209c49e71f3e9ae24a49fa543685605924e8378..64ae52f496adcecaff24818f6f789a2397fa23b2 100644
--- a/hog/operator_generation/optimizer.py
+++ b/hog/operator_generation/optimizer.py
@@ -15,24 +15,35 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 from copy import deepcopy
+from dataclasses import dataclass
 import enum
 import logging
+import re
 import sympy as sp
-from typing import Dict, Iterable, List, Set, Any
+from typing import cast, Dict, Iterable, List, Set, Any
 
 from pystencils import Kernel, TypedSymbol
-from pystencils.backend.ast.expressions import PsCall, PsConstant, PsExpression
+from pystencils.backend.ast.expressions import PsCall, PsExpression
 from pystencils.backend.ast.structural import PsAssignment, PsBlock, PsLoop
+from pystencils.backend.constants import PsConstant
 from pystencils.backend.functions import MathFunctions, PsMathFunction
 from pystencils.backend.kernelcreation import KernelCreationContext
+import pystencils.backend.platforms as platforms
 from pystencils.backend.transformations import (
     CanonicalizeSymbols,
     EliminateBranches,
     EliminateConstants,
     HoistLoopInvariantDeclarations,
+    LoopVectorizer,
     LowerToC,
     ReshapeLoops,
+    SelectIntrinsics,
 )
+from pystencils.codegen.config import VectorizationConfig
+from pystencils.codegen.target import Target
+from pystencils.defaults import DEFAULTS
+from pystencils.types import PsScalarType
+from pystencils.types.quick import Fp
 
 from hog.ast import Operations, count_operations
 import hog.cse
@@ -96,6 +107,14 @@ def ordered_opts_suffix(loop_strategy: LoopStrategy, opts: Iterable[Opts]) -> st
     return opts_suffix
 
 
+@dataclass
+class OptimizedAst:
+    ast: PsBlock
+    desc: str
+    target: Target
+    platform: platforms.Platform
+
+
 class Optimizer:
     def __init__(self, opts: Set[Opts]):
         self._opts = set(opts)
@@ -136,26 +155,21 @@ class Optimizer:
     def apply_to_kernel(
         self,
         ctx: KernelCreationContext,
-        kernel: PsBlock,
+        kernel_ast: PsBlock,
         dim: int,
         loop_strategy: LoopStrategy,
-    ) -> Dict[str, Kernel]:
+    ) -> List[OptimizedAst]:
         """Applies optimizations to the given kernel function.
 
-        The optimizations are applied to the passed kernel which is modified
-        in place. Additionally, optimizations which require hardware support
-        (i.e. vectorization) are applied in isolation. This method returns a
-        dictionary mapping platform features to kernels. The kernels in this
-        dictionary only require the hardware feature specified by the dictionary
-        key.
-        For example, when vectorization (avx) is requested the returned dict looks like
-            { "noarch": (optimized kernel *without vectorization*)
-            , "avx"   : (vectorized kernel)
-            }.
-        The "avx" kernel is the same Python object as the argument to this
-        function.
+        This method returns a list of optimized kernel implementations ordered
+        increasingly by hardware requirements. In particular, if vectorization
+        is requested, the returned list contains two elements: a scalar kernel
+        at index 0, and a vectorized kernel at index 1. The returned object
+        contains the AST as well as information about the required hardware.
         """
 
+        # 1. platform independent optimizations
+
         if isinstance(loop_strategy, CUBES):
             # We cut the innermost loop so that the conditionals can be
             # simplified/removed. In particular, we split the innermost loop
@@ -166,7 +180,9 @@ class Optimizer:
             with TimedLogger("cutting loops", logging.DEBUG):
                 reshape_loops = ReshapeLoops(ctx)
 
-                loops = [loop for loop in kernel.statements if isinstance(loop, PsLoop)]
+                loops = [
+                    loop for loop in kernel_ast.statements if isinstance(loop, PsLoop)
+                ]
                 assert len(loops) == 1, f"Expecting a single loop here, not {loops}"
                 loop = loops[0]
                 assert (
@@ -190,121 +206,159 @@ class Optimizer:
             with TimedLogger("simplifying conditionals", logging.DEBUG):
                 loop = EliminateBranches(ctx)(loop)
 
+        # TODO: Fold and extract constants
+        # elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
+        # kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
+
         if self[Opts.MOVECONSTANTS]:
             with TimedLogger("moving constants out of loop", logging.DEBUG):
                 hoist_invariants = HoistLoopInvariantDeclarations(ctx)
-                kernel.statements = hoist_invariants(kernel).statements
+                kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
 
-        with TimedLogger("lowering", logging.DEBUG):
-            # Lowering
-            lower_to_c = LowerToC(ctx)
-            kernel = lower_to_c(kernel)
+        # 2. hardware specific optimizations
 
-            # Late canonicalization and constant elimination passes
-            #  * Since lowering introduces new index calculations and indexing symbols into the AST,
-            #  * these need to be handled here
+        noarch_kernel = OptimizedAst(
+            kernel_ast, "noarch", Target.GenericCPU, platforms.GenericCpu(ctx)
+        )
+        optimized_kernels = [noarch_kernel]
 
-            canonicalize = CanonicalizeSymbols(ctx, True)
-            kernel = canonicalize(kernel)
+        if self[Opts.VECTORIZE]:
+            with TimedLogger("vectorizing loops", logging.DEBUG):
+                kernel_ast = kernel_ast.clone()
 
-            # TODO expose constant folding as another optimization pass/option
-            # late_fold_constants = EliminateConstants(ctx, extract_constant_exprs=False)
-            # kernel = late_fold_constants(kernel)
+                if self[Opts.VECTORIZE512]:
+                    if ctx.default_dtype == Fp(16):
+                        arch = platforms.X86VectorArch.AVX512_FP16
+                        target = Target.X86_AVX512_FP16
+                    else:
+                        arch = platforms.X86VectorArch.AVX512
+                        target = Target.X86_AVX512
+                else:
+                    arch = platforms.X86VectorArch.AVX
+                    target = Target.X86_AVX
 
-        # self.elimtmps_and_reorder(kernel_function)
+                desc = "avx512" if self[Opts.VECTORIZE512] else "avx"
+                platform = platforms.X86VectorCpu(ctx, arch)
 
-        platform_dependent_kernels = {"noarch": deepcopy(kernel)}
+                lanes = VectorizationConfig.default_lanes(
+                    target, cast(PsScalarType, ctx.default_dtype)
+                )
+                vectorizer = LoopVectorizer(ctx, lanes)
 
-        if self[Opts.VECTORIZE]:
-            instruction_set = "avx512" if self[Opts.VECTORIZE512] else "avx"
+                def loop_predicate(loop: PsLoop) -> bool:
+                    return (
+                        ctx.basename(loop.counter.symbol)
+                        == DEFAULTS.spatial_counter_names[0]
+                    )
 
-            with TimedLogger("vectorizing loops", logging.DEBUG):
-                vectorize(
-                    kernel_function,
-                    instruction_set=instruction_set,
-                    replace_loops_with_length_one=False,
+                kernel_ast = vectorizer.vectorize_select_loops(
+                    kernel_ast, loop_predicate
                 )
-            platform_dependent_kernels[instruction_set] = kernel_function
-
-        return platform_dependent_kernels
-
-    def reduce_reuse_dist(self, stmts: List[PsAssignment]) -> List[PsAssignment]:
-        # collect the accessed symbols of each statement (accessed on their rhs) in a dict
-        accessed_symbols = {}
-        for stmt in stmts:
-            accessed_symbols[stmt.lhs.name] = [
-                s.name for s in stmt.rhs.atoms(TypedSymbol, sp.Symbol)
-            ]
-
-        ctr = 0
-        # we want to find a position close to the point where all its accessed symbols/dependencies are defined
-        while ctr < len(stmts) - 1:
-            # take the current statement
-            stmt = stmts.pop(ctr)
-            ctr2 = 0
-            # move upwards in the statement list
-            while ctr - ctr2 >= 0:
-                #  print("ctr, ctr2=(" + str(ctr) + "," + str(ctr2) + ")")
-                cur_pos = ctr - ctr2
-                stmt2 = stmts[cur_pos]
-                # if the current stmt2 we compare with defines a symbol that is accessed by stmt we stop and insert at that position
-                if stmt2.lhs.name in accessed_symbols[stmt.lhs.name]:
-                    stmts.insert(cur_pos + 1, stmt)
-                    break
-                # if we are at the beginning of the list stmt is not depending on any definition in the kernel (except for the loop counter)
-                # and can be put at its beginning
-                if ctr - ctr2 == 0:
-                    stmts.insert(0, stmt)
-                ctr2 += 1
-
-            ctr += 1
-        return stmts
-
-    def elimtmps_and_reorder(self, kernel_function: Kernel) -> None:
-        main_body = kernel_function.body
-
-        # eliminate temporaries with only a small number of flops
-        if self[Opts.ELIMTMPS]:
-            with TimedLogger(
-                "inlining tmps with small rhs (< 4 mul/add)", logging.DEBUG
-            ):
-
-                def eliminate(stmt, stmts, ctr):
-                    tmp_symbol = deepcopy(stmt.lhs)
-                    for stmt_inner in stmts[ctr + 1 :]:
-                        if (
-                            # TODO do not simply ignore other AST nodes
-                            isinstance(stmt_inner, PsAssignment)
-                            and stmt_inner.lhs.name != tmp_symbol.name
-                        ):
-                            stmt_inner.subs({tmp_symbol: stmt.rhs})
-
-                ctr = 0
-                stmts_outer = main_body.take_child_nodes()
-                for stmt_outer in stmts_outer:
-                    if isinstance(stmt_outer, PsAssignment):
-                        ops = Operations()
-                        count_operations(stmt_outer.rhs, ops)
-                        flops = vars(ops)
-                        if (
-                            flops["adds"] + flops["muls"] < 4
-                            and flops["divs"] == 0
-                            and flops["pows"] == 0
-                            and flops["abs"] == 0
-                        ):
-                            eliminate(stmt_outer, stmts_outer, ctr)
-                        else:
-                            main_body.append(stmt_outer)
-                    else:
-                        # TODO recurse into blocks, loop bodies, etc.
-                        main_body.append(stmt_outer)
-                    ctr += 1
-
-        if self[Opts.REORDER]:
-            with TimedLogger(
-                "reordering stmts to reduce locality of access for tmps", logging.DEBUG
-            ):
-                # TODO recurse into blocks, loop bodies, etc.
-                main_body = Block(self.reduce_reuse_dist(main_body.args))
-
-        kernel_function.body = main_body
+
+                select_intrin = SelectIntrinsics(
+                    ctx, platform, use_builtin_convertvector=True
+                )
+                kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
+
+                optimized_kernels.append(
+                    OptimizedAst(kernel_ast, desc, target, platform)
+                )
+
+        with TimedLogger("lowering", logging.DEBUG):
+            # Lowering
+            lower_to_c = LowerToC(ctx)
+            # Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
+            canonicalize = CanonicalizeSymbols(ctx, True)
+
+            for opt_kernel in optimized_kernels:
+                opt_kernel.ast = cast(PsBlock, lower_to_c(opt_kernel.ast))
+                opt_kernel.ast = cast(PsBlock, canonicalize(opt_kernel.ast))
+
+            # TODO
+            # select_functions = SelectFunctions(self._platform)
+            # kernel_ast = cast(PsBlock, select_functions(kernel_ast))
+
+        # self.elimtmps_and_reorder(kernel_function)
+
+        return optimized_kernels
+
+    # def reduce_reuse_dist(self, stmts: List[PsAssignment]) -> List[PsAssignment]:
+    #     # collect the accessed symbols of each statement (accessed on their rhs) in a dict
+    #     accessed_symbols = {}
+    #     for stmt in stmts:
+    #         accessed_symbols[stmt.lhs.name] = [
+    #             s.name for s in stmt.rhs.atoms(TypedSymbol, sp.Symbol)
+    #         ]
+
+    #     ctr = 0
+    #     # we want to find a position close to the point where all its accessed symbols/dependencies are defined
+    #     while ctr < len(stmts) - 1:
+    #         # take the current statement
+    #         stmt = stmts.pop(ctr)
+    #         ctr2 = 0
+    #         # move upwards in the statement list
+    #         while ctr - ctr2 >= 0:
+    #             #  print("ctr, ctr2=(" + str(ctr) + "," + str(ctr2) + ")")
+    #             cur_pos = ctr - ctr2
+    #             stmt2 = stmts[cur_pos]
+    #             # if the current stmt2 we compare with defines a symbol that is accessed by stmt we stop and insert at that position
+    #             if stmt2.lhs.name in accessed_symbols[stmt.lhs.name]:
+    #                 stmts.insert(cur_pos + 1, stmt)
+    #                 break
+    #             # if we are at the beginning of the list stmt is not depending on any definition in the kernel (except for the loop counter)
+    #             # and can be put at its beginning
+    #             if ctr - ctr2 == 0:
+    #                 stmts.insert(0, stmt)
+    #             ctr2 += 1
+
+    #         ctr += 1
+    #     return stmts
+
+    # def elimtmps_and_reorder(self, kernel_function: Kernel) -> None:
+    #     main_body = kernel_function.body
+
+    #     # eliminate temporaries with only a small number of flops
+    #     if self[Opts.ELIMTMPS]:
+    #         with TimedLogger(
+    #             "inlining tmps with small rhs (< 4 mul/add)", logging.DEBUG
+    #         ):
+
+    #             def eliminate(stmt, stmts, ctr):
+    #                 tmp_symbol = deepcopy(stmt.lhs)
+    #                 for stmt_inner in stmts[ctr + 1 :]:
+    #                     if (
+    #                         # TODO do not simply ignore other AST nodes
+    #                         isinstance(stmt_inner, PsAssignment)
+    #                         and stmt_inner.lhs.name != tmp_symbol.name
+    #                     ):
+    #                         stmt_inner.subs({tmp_symbol: stmt.rhs})
+
+    #             ctr = 0
+    #             stmts_outer = main_body.take_child_nodes()
+    #             for stmt_outer in stmts_outer:
+    #                 if isinstance(stmt_outer, PsAssignment):
+    #                     ops = Operations()
+    #                     count_operations(stmt_outer.rhs, ops)
+    #                     flops = vars(ops)
+    #                     if (
+    #                         flops["adds"] + flops["muls"] < 4
+    #                         and flops["divs"] == 0
+    #                         and flops["pows"] == 0
+    #                         and flops["abs"] == 0
+    #                     ):
+    #                         eliminate(stmt_outer, stmts_outer, ctr)
+    #                     else:
+    #                         main_body.append(stmt_outer)
+    #                 else:
+    #                     # TODO recurse into blocks, loop bodies, etc.
+    #                     main_body.append(stmt_outer)
+    #                 ctr += 1
+
+    #     if self[Opts.REORDER]:
+    #         with TimedLogger(
+    #             "reordering stmts to reduce locality of access for tmps", logging.DEBUG
+    #         ):
+    #             # TODO recurse into blocks, loop bodies, etc.
+    #             main_body = Block(self.reduce_reuse_dist(main_body.args))
+
+    #     kernel_function.body = main_body
diff --git a/hog/symbolizer.py b/hog/symbolizer.py
index 48d3ad607d1c2a0b92059d19561b97ce45e9c706..79a78b14ffb678252ac9b43040d20f9fc9260431 100644
--- a/hog/symbolizer.py
+++ b/hog/symbolizer.py
@@ -44,7 +44,6 @@ class Symbolizer:
         symbol_jac_affine_inv="jac_affine_inv",
         symbol_abs_det_jac_affine="abs_det_jac_affine",
         symbol_blending_parameter_prefix="blending_param_",
-        float_loop_ctr_array_prefix="float_loop_ctr_array_dim_",
         symbol_jac_blending="jac_blending",
         symbol_jac_blending_inv="jac_blending_inv",
         symbol_abs_det_jac_blending="abs_det_jac_blending",
@@ -79,7 +78,6 @@ class Symbolizer:
         self._symbol_jac_affine_inv = symbol_jac_affine_inv
         self._symbol_abs_det_jac_affine = symbol_abs_det_jac_affine
         self._symbol_blending_parameter_prefix = symbol_blending_parameter_prefix
-        self._float_loop_ctr_array_prefix = float_loop_ctr_array_prefix
         self._symbol_jac_blending = symbol_jac_blending
         self._symbol_jac_blending_inv = symbol_jac_blending_inv
         self._symbol_abs_det_jac_blending = symbol_abs_det_jac_blending
@@ -258,8 +256,3 @@ class Symbolizer:
             + list(self.abs_det_jac_affine_to_blending().free_symbols)
             + list(sp.Array(self.hessian_blending_map(dimensions)).free_symbols)
         )
-
-    def float_loop_ctr_array(self, dimensions: int) -> List[sp.Symbol]:
-        return sp.symbols(
-            [f"{self._float_loop_ctr_array_prefix}{d}" for d in range(dimensions)]
-        )
diff --git a/pyproject.toml b/pyproject.toml
index db5b4aea92c7b4e7dff859d6825bb7aa4efec701..2ceea140ffe053a1ba2d46842078de42d1a8565a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ dependencies = [
   "numpy==1.24.3",
   "quadpy-gpl==0.16.10",
   "poly-cse-py",
-  "pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@c5cf38b125a569928c16f7bb2f0551ca7f3086d5",
+  "pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@4f8e42e6c2866dce3f1244c41ecdf45db896ec61",
   "pytest==7.3.1",
   "sympy==1.11.1",
   "tabulate==0.9.0",
diff --git a/requirements.txt b/requirements.txt
index f7100bbfc871efe05c2f8d63663cff830c70b4f1..ef70266f101448e6fb27deee76a51cc96656e1d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ islpy
 ndim @ https://github.com/sigma-py/ndim/archive/refs/tags/v0.1.6.tar.gz
 numpy==1.24.3
 poly-cse-py
-pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@c5cf38b125a569928c16f7bb2f0551ca7f3086d5
+pystencils @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@4f8e42e6c2866dce3f1244c41ecdf45db896ec61
 pytest==7.3.1
 sympy==1.11.1
 tabulate==0.9.0