diff --git a/src/pystencils/backend/ast/expressions.py b/src/pystencils/backend/ast/expressions.py
index a850470ffacf4f528ca5883e5d91b14fa6aa5f9c..167d732c7c9aaa0560798aef5242b9a9eb4e511b 100644
--- a/src/pystencils/backend/ast/expressions.py
+++ b/src/pystencils/backend/ast/expressions.py
@@ -51,7 +51,13 @@ class PsExpression(PsAstNode, ABC):
 
     def get_dtype(self) -> PsType:
+        """Return the data type stored on this expression.
+
+        Raises:
+            PsInternalCompilerError: If no data type has been set on this
+                expression (``self._dtype`` is ``None``).
+        """
         if self._dtype is None:
-            raise PsInternalCompilerError("No dtype set on this expression yet.")
+            raise PsInternalCompilerError(f"No data type set on expression {self}.")
 
         return self._dtype
 
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index f6d9f0993b9e42c24869b30f14101deaab7f6052..3e74e89282b44011295c742d96b35ce63401bc67 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -78,6 +78,43 @@ class X86VectorArch(Enum):
                 )
 
         return suffix
+
+    def intrin_type(self, vtype: PsVectorType) -> PsCustomType:
+        """Return the x86 intrinsic type used to represent ``vtype``.
+
+        The result is a `PsCustomType` named ``__m<width><suffix>``, where
+        ``<width>`` is ``vtype.width`` and ``<suffix>`` depends on the scalar type:
+        ``h`` for ``Fp(16)`` (only if this architecture is at least AVX512),
+        no suffix for ``Fp(32)``, ``d`` for ``Fp(64)`` and ``i`` for any ``SInt``.
+
+        Args:
+            vtype: Vector type to be mapped onto an intrinsic type.
+
+        Raises:
+            MaterializationError: If the scalar type of ``vtype`` is not supported
+                by this architecture, or if ``vtype.width`` exceeds
+                ``self.max_vector_width``.
+        """
+        scalar_type = vtype.scalar_type
+        match scalar_type:
+            case Fp(16) if self >= X86VectorArch.AVX512:
+                suffix = "h"
+            case Fp(32):
+                suffix = ""
+            case Fp(64):
+                suffix = "d"
+            case SInt(_):
+                suffix = "i"
+            case _:
+                raise MaterializationError(
+                    f"x86/{self} does not support scalar type {scalar_type}"
+                )
+
+        if vtype.width > self.max_vector_width:
+            raise MaterializationError(
+                f"x86/{self} does not support {vtype}"
+            )
+        return PsCustomType(f"__m{vtype.width}{suffix}")
 
 
 class X86VectorCpu(GenericVectorCpu):
@@ -113,26 +135,12 @@ class X86VectorCpu(GenericVectorCpu):
         return super().required_headers | headers
 
     def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType:
-        scalar_type = vector_type.scalar_type
-        match scalar_type:
-            case Fp(16) if self._vector_arch >= X86VectorArch.AVX512:
-                suffix = "h"
-            case Fp(32):
-                suffix = ""
-            case Fp(64):
-                suffix = "d"
-            case SInt(_):
-                suffix = "i"
-            case _:
-                raise MaterializationError(
-                    f"x86/{self._vector_arch} does not support scalar type {scalar_type}"
-                )
-
-        if vector_type.width > self._vector_arch.max_vector_width:
-            raise MaterializationError(
-                f"x86/{self._vector_arch} does not support {vector_type}"
-            )
-        return PsCustomType(f"__m{vector_type.width}{suffix}")
+        """Return the intrinsic type representing ``vector_type``.
+
+        The selection is delegated to ``intrin_type`` of this platform's
+        vector architecture (``self._vector_arch``).
+        """
+        return self._vector_arch.intrin_type(vector_type)
 
     def constant_intrinsic(self, c: PsConstant) -> PsExpression:
         vtype = c.dtype
@@ -214,12 +217,16 @@ def _x86_op_intrin(
 ) -> CFunction:
     prefix = varch.intrin_prefix(vtype)
     suffix = varch.intrin_suffix(vtype)
+    #   Unless overridden below, arguments and return value have the intrinsic vector type
+    rtype = atype = varch.intrin_type(vtype)
 
     match op:
         case PsVecBroadcast():
             opstr = "set1"
             if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4:
                 suffix += "x"
+            #   The argument of the broadcast intrinsic has the scalar type
+            atype = vtype.scalar_type
         case PsAdd():
             opstr = "add"
         case PsSub():
@@ -238,4 +243,4 @@ def _x86_op_intrin(
             raise MaterializationError(f"Unable to select operation intrinsic for {type(op)}")
 
     num_args = 1 if isinstance(op, PsUnOp) else 2
-    return CFunction(f"{prefix}_{opstr}_{suffix}", (vtype,) * num_args, vtype)
+    return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
diff --git a/src/pystencils/backend/transformations/lower_to_c.py b/src/pystencils/backend/transformations/lower_to_c.py
index 0576616f2f2989ae72887f5cd720f263017a6b0a..62183fdf7f54e295b7ec459026f248b4018c4546 100644
--- a/src/pystencils/backend/transformations/lower_to_c.py
+++ b/src/pystencils/backend/transformations/lower_to_c.py
@@ -18,7 +18,7 @@ from ..ast.expressions import (
     PsCast,
     PsSymbolExpr,
 )
-from ...types import PsStructType, PsPointerType, PsUnsignedIntegerType
+from ...types import PsType, PsStructType, PsPointerType, PsUnsignedIntegerType
 
 
 class LowerToC:
@@ -37,9 +37,16 @@ class LowerToC:
 
     def __init__(self, ctx: KernelCreationContext) -> None:
         self._ctx = ctx
+        self._substitutions: dict[PsSymbol, PsSymbol] = dict()
+
         self._typify = Typifier(ctx)
 
-        self._substitutions: dict[PsSymbol, PsSymbol] = dict()
+        #   NOTE(review): function-scope import, presumably to avoid a circular
+        #   import between the transformation modules - confirm
+        from .eliminate_constants import EliminateConstants
+
+        #   Constant folder; applied to the memory accesses produced by lowering
+        self._fold = EliminateConstants(self._ctx)
 
     def __call__(self, node: PsAstNode) -> PsAstNode:
         self._substitutions = dict()
@@ -65,7 +69,8 @@ class LowerToC:
                         return i
 
                 summands: list[PsExpression] = [
-                    maybe_cast(cast(PsExpression, self.visit(idx).clone())) * PsExpression.make(stride)
+                    maybe_cast(cast(PsExpression, self.visit(idx).clone()))
+                    * PsExpression.make(stride)
                     for idx, stride in zip(indices, buf.strides, strict=True)
                 ]
 
@@ -77,9 +82,11 @@ class LowerToC:
 
                 mem_acc = PsMemAcc(bptr.clone(), linearized_idx)
 
-                return self._typify.typify_expression(
-                    mem_acc, target_type=buf.element_type
-                )[0]
+                return self._fold(
+                    self._typify.typify_expression(
+                        mem_acc, target_type=buf.element_type
+                    )[0]
+                )
 
             case PsLookup(aggr, member_name) if isinstance(
                 aggr, PsBufferAcc
@@ -115,10 +122,7 @@ class LowerToC:
                 const=bp_type.const,
                 restrict=bp_type.restrict,
             )
-            type_erased_bp = PsSymbol(
-                bp.name,
-                erased_type
-            )
+            type_erased_bp = PsSymbol(bp.name, erased_type)
             type_erased_bp.add_property(BufferBasePtr(buf))
             self._substitutions[bp] = type_erased_bp
         else:
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index e12a820343fb17530742e7c1c404d6595edc0f9e..9e2af1b7e3cebe1e24b3a103ba72d677fc2d6d38 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -5,7 +5,7 @@ from warnings import warn
 from collections.abc import Collection
 
 from typing import Sequence
-from dataclasses import dataclass, InitVar
+from dataclasses import dataclass, InitVar, replace
 
 from .target import Target
 from .field import Field, FieldType
@@ -410,7 +410,7 @@ class CreateKernelConfig:
             warn(
                 "Setting the deprecated `data_type` will override the value of `default_dtype`. "
                 "Set `default_dtype` instead.",
-                FutureWarning,
+                UserWarning,
             )
             self.default_dtype = data_type
 
@@ -433,7 +433,63 @@ class CreateKernelConfig:
 
         if cpu_vectorize_info is not None:
             _deprecated_option("cpu_vectorize_info", "cpu_optim.vectorize")
-            raise NotImplementedError("CPU vectorization is not implemented yet")
+            #   A legacy `instruction_set` entry is translated into a vector CPU target
+            if "instruction_set" in cpu_vectorize_info:
+                if self.target != Target.GenericCPU:
+                    raise PsOptionsError(
+                        "Setting 'instruction_set' in the deprecated 'cpu_vectorize_info' option is only "
+                        "valid if `target == Target.CPU`."
+                    )
+
+                isa = cpu_vectorize_info["instruction_set"]
+                vec_target: Target
+                match isa:
+                    case "best":
+                        #   Guard against an empty collection of available vector targets:
+                        #   report an options error instead of failing inside `pop()`
+                        avail_targets = Target.available_vector_cpu_targets()
+                        if not avail_targets:
+                            raise PsOptionsError(
+                                "No vector CPU target is available to resolve value 'best' "
+                                'in `cpu_vectorize_info["instruction_set"]`.'
+                            )
+                        vec_target = avail_targets.pop()
+                    case "sse":
+                        vec_target = Target.X86_SSE
+                    case "avx":
+                        vec_target = Target.X86_AVX
+                    case "avx512":
+                        vec_target = Target.X86_AVX512
+                    case "avx512vl":
+                        vec_target = Target.X86_AVX512 | Target._VL
+                    case _:
+                        raise PsOptionsError(
+                            f'Value {isa} in `cpu_vectorize_info["instruction_set"]` is not supported.'
+                        )
+
+                warn(
+                    f"Value {isa} for `instruction_set` in deprecated `cpu_vectorize_info` "
+                    "will override the `target` option. "
+                    f"Set `target` to {vec_target} instead.",
+                    UserWarning,
+                )
+
+                self.target = vec_target
+
+            #   Map the remaining legacy keys onto the new vectorization options
+            deprecated_vec_opts = VectorizationConfig(
+                assume_inner_stride_one=cpu_vectorize_info.get(
+                    "assume_inner_stride_one", False
+                ),
+                assume_aligned=cpu_vectorize_info.get("assume_aligned", False),
+                use_nontemporal_stores=cpu_vectorize_info.get("nontemporal", False),
+            )
+
+            #   Store the translated options in `optim`, which is processed below
+            if optim is not None:
+                optim = replace(optim, vectorize=deprecated_vec_opts)
+            else:
+                optim = CpuOptimConfig(vectorize=deprecated_vec_opts)
 
         if optim is not None:
             if self.cpu_optim is not None:
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index ecf6264717358375d4da7f7880e83588bea24790..651a67cf2092a93e9e1cab3f393c2edd1baf15a9 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -144,6 +144,9 @@ class DefaultKernelCreationDriver:
         if self._cfg.target.is_cpu():
             kernel_ast = self._transform_for_cpu(kernel_ast)
 
+        #   Note: After this point, the AST may contain intrinsics, so type-dependent
+        #   transformations cannot be run any more
+
         #   Lowering
         lower_to_c = LowerToC(self._ctx)
         kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
@@ -151,18 +154,11 @@ class DefaultKernelCreationDriver:
         select_functions = SelectFunctions(self._platform)
         kernel_ast = cast(PsBlock, select_functions(kernel_ast))
 
-        #   Late canonicalization and constant elimination passes
-        #    * Since lowering introduces new index calculations and indexing symbols into the AST,
-        #    * these need to be handled here
+        #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
 
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
 
-        late_fold_constants = EliminateConstants(
-            self._ctx, extract_constant_exprs=False
-        )
-        kernel_ast = cast(PsBlock, late_fold_constants(kernel_ast))
-
         if self._cfg.target.is_cpu():
             return create_cpu_kernel_function(
                 self._ctx,
diff --git a/tests/nbackend/transformations/test_lower_to_c.py b/tests/nbackend/transformations/test_lower_to_c.py
index b557a7493f9a84cb13b511e8fca1f898823bc9bb..75e6daf4bc67014183f834e5b38fc8245bd13a4f 100644
--- a/tests/nbackend/transformations/test_lower_to_c.py
+++ b/tests/nbackend/transformations/test_lower_to_c.py
@@ -51,14 +51,13 @@ def test_lower_buffer_accesses():
     assert isinstance(fasm_lowered.lhs.pointer, PsSymbolExpr)
     assert fasm_lowered.lhs.pointer.symbol == f_buf.base_pointer
 
-    zero = factory.parse_index(0)
     expected_offset = reduce(
         add,
         (
-            (PsExpression.make(dm.counter) + zero) * PsExpression.make(stride)
+            (PsExpression.make(dm.counter)) * PsExpression.make(stride)
             for dm, stride in zip(ispace.dimensions, f_buf.strides)
         ),
-    ) + factory.parse_index(1) * PsExpression.make(f_buf.strides[-1])
+    ) + PsExpression.make(f_buf.strides[-1])
     assert fasm_lowered.lhs.offset.structurally_equal(expected_offset)
 
     assert isinstance(fasm_lowered.rhs, PsMemAcc)
diff --git a/tests/test_quicktests.py b/tests/test_quicktests.py
index 506e2bf2ca8a2d6b65212f5bea5caf715201773d..5d5dba0eada50ed7c7f2a6f4f3ddb342781e23b6 100644
--- a/tests/test_quicktests.py
+++ b/tests/test_quicktests.py
@@ -9,25 +9,32 @@ def test_basic_kernel():
         dh = ps.create_data_handling(domain_size=domain_shape, periodicity=True)
         assert all(dh.periodicity)
 
-        f = dh.add_array('f', values_per_cell=1)
-        tmp = dh.add_array('tmp', values_per_cell=1)
+        f = dh.add_array("f", values_per_cell=1)
+        tmp = dh.add_array("tmp", values_per_cell=1)
 
         stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)]
-        stencil_3d = [(1, 0, 0), (-1, 0, 0), (0, 1, 0), (0, -1, 0), (0, 0, 1), (0, 0, -1)]
+        stencil_3d = [
+            (1, 0, 0),
+            (-1, 0, 0),
+            (0, 1, 0),
+            (0, -1, 0),
+            (0, 0, 1),
+            (0, 0, -1),
+        ]
         stencil = stencil_2d if dh.dim == 2 else stencil_3d
 
         jacobi = ps.Assignment(tmp.center, sum(f.neighbors(stencil)) / len(stencil))
         kernel = ps.create_kernel(jacobi).compile()
 
         for b in dh.iterate(ghost_layers=1):
-            b['f'].fill(42)
+            b["f"].fill(42)
         dh.run_kernel(kernel)
         for b in dh.iterate(ghost_layers=0):
-            np.testing.assert_equal(b['f'], 42)
+            np.testing.assert_equal(b["f"], 42)
 
         float_seq = [1.0, 2.0, 3.0, 4.0]
         int_seq = [1, 2, 3]
-        for op in ('min', 'max', 'sum'):
+        for op in ("min", "max", "sum"):
             assert (dh.reduce_float_sequence(float_seq, op) == float_seq).all()
             assert (dh.reduce_int_sequence(int_seq, op) == int_seq).all()
 
@@ -37,10 +44,13 @@ def test_basic_blocking_staggered():
     f = ps.fields("f: double[2D]")
     stag = ps.fields("stag(2): double[2D]", field_type=ps.FieldType.STAGGERED)
     terms = [
-       f[0, 0] - f[-1, 0],
-       f[0, 0] - f[0, -1],
+        f[0, 0] - f[-1, 0],
+        f[0, 0] - f[0, -1],
+    ]
+    assignments = [
+        ps.Assignment(stag.staggered_access(d), terms[i])
+        for i, d in enumerate(stag.staggered_stencil)
     ]
-    assignments = [ps.Assignment(stag.staggered_access(d), terms[i]) for i, d in enumerate(stag.staggered_stencil)]
     kernel = ps.create_staggered_kernel(assignments, cpu_blocking=(3, 16)).compile()
     reference_kernel = ps.create_staggered_kernel(assignments).compile()
 
@@ -52,24 +62,32 @@ def test_basic_blocking_staggered():
     np.testing.assert_almost_equal(stag_arr, stag_ref)
 
 
-@pytest.mark.xfail(reason="Vectorization not implemented yet")
 def test_basic_vectorization():
-    supported_instruction_sets = get_supported_instruction_sets()
-    if supported_instruction_sets:
-        instruction_set = supported_instruction_sets[-1]
-    else:
-        instruction_set = None
+    """Compile and run a vectorized five-point stencil kernel.
+
+    The test is skipped if ``ps.Target.auto_cpu()`` is not a vector CPU target.
+    """
+    target = ps.Target.auto_cpu()
+    if not target.is_vector_cpu():
+        pytest.skip("No vector CPU available")
 
     f, g = ps.fields("f, g : double[2D]")
-    update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
-    ast = ps.create_kernel(update_rule)
+    update_rule = [
+        ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)
+    ]
+    ast = ps.create_kernel(
+        update_rule,
+        target=target,
+        cpu_optim=ps.CpuOptimConfig(
+            vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
+        ),
+    )
 
-    replace_inner_stride_with_one(ast)
-    vectorize(ast, instruction_set=instruction_set)
     func = ast.compile()
 
     arr = np.ones((23 + 2, 17 + 2)) * 5.0
     dst = np.zeros_like(arr)
 
     func(g=dst, f=arr)
-    np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
\ No newline at end of file
+    #   Each interior cell sums five entries of 5.0 and adds 42.0
+    np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)