Daniel Bauer · Frederik Hennig · Frederik Hennig · 4f8e42e6 · 4f8e42e6 · 4f8e42e6
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -57,7 +57,7 @@ class GenericCpu(Platform):

    def select_function(self, call: PsCall) -> PsExpression:
        assert isinstance(call.function, PsMathFunction)
-        
+
        func = call.function.func
        dtype = call.get_dtype()
        arg_types = (dtype,) * func.num_args
@@ -87,13 +87,13 @@ class GenericCpu(Platform):

            call.function = cfunc
            return call
-                
+
        if isinstance(dtype, PsIntegerType):
            match func:
                case MathFunctions.Abs:
                    zero = PsExpression.make(PsConstant(0, dtype))
                    arg = call.args[0]
-                    return PsTernary(PsGe(arg, zero), arg, - arg)
+                    return PsTernary(PsGe(arg, zero), arg, -arg)
                case MathFunctions.Min:
                    arg1, arg2 = call.args
                    return PsTernary(PsLe(arg1, arg2), arg1, arg2)
@@ -131,7 +131,10 @@ class GenericCpu(Platform):
                PsLookup(
                    PsBufferAcc(
                        ispace.index_list.base_pointer,
-                        (PsExpression.make(ispace.sparse_counter), factory.parse_index(0)),
+                        (
+                            PsExpression.make(ispace.sparse_counter),
+                            factory.parse_index(0),
+                        ),
                    ),
                    coord.name,
                ),
@@ -158,26 +161,33 @@ class GenericVectorCpu(GenericCpu, ABC):
    @abstractmethod
    def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType:
        """Return the intrinsic vector type for the given generic vector type,
-        or raise an `MaterializationError` if type is not supported."""
+        or raise a `MaterializationError` if type is not supported."""

    @abstractmethod
    def constant_intrinsic(self, c: PsConstant) -> PsExpression:
        """Return an expression that initializes a constant vector,
-        or raise an `MaterializationError` if not supported."""
+        or raise a `MaterializationError` if not supported."""

    @abstractmethod
    def op_intrinsic(
        self, expr: PsExpression, operands: Sequence[PsExpression]
    ) -> PsExpression:
        """Return an expression intrinsically invoking the given operation
-        or raise an `MaterializationError` if not supported."""
+        or raise a `MaterializationError` if not supported."""
+
+    @abstractmethod
+    def math_func_intrinsic(
+        self, expr: PsCall, operands: Sequence[PsExpression]
+    ) -> PsExpression:
+        """Return an expression intrinsically invoking the given mathematical
+        function or raise a `MaterializationError` if not supported."""

    @abstractmethod
    def vector_load(self, acc: PsVecMemAcc) -> PsExpression:
        """Return an expression intrinsically performing a vector load,
-        or raise an `MaterializationError` if not supported."""
+        or raise a `MaterializationError` if not supported."""

    @abstractmethod
    def vector_store(self, acc: PsVecMemAcc, arg: PsExpression) -> PsExpression:
        """Return an expression intrinsically performing a vector store,
-        or raise an `MaterializationError` if not supported."""
+        or raise a `MaterializationError` if not supported."""
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -13,7 +13,9 @@ from ..ast.expressions import (
    PsSub,
    PsMul,
    PsDiv,
-    PsConstantExpr
+    PsConstantExpr,
+    PsCast,
+    PsCall,
 )
 from ..ast.vector import PsVecMemAcc, PsVecBroadcast
 from ...types import PsCustomType, PsVectorType, PsPointerType
@@ -23,15 +25,15 @@ from ..exceptions import MaterializationError
 from .generic_cpu import GenericVectorCpu
 from ..kernelcreation import KernelCreationContext

-from ...types.quick import Fp, SInt
-from ..functions import CFunction
+from ...types.quick import Fp, UInt, SInt
+from ..functions import CFunction, PsMathFunction, MathFunctions


 class X86VectorArch(Enum):
    SSE = 128
    AVX = 256
    AVX512 = 512
-    AVX512_FP16 = AVX512 + 1    # TODO improve modelling?
+    AVX512_FP16 = AVX512 + 1  # TODO improve modelling?

    def __ge__(self, other: X86VectorArch) -> bool:
        return self.value >= other.value
@@ -78,7 +80,7 @@ class X86VectorArch(Enum):
                )

        return suffix
-    
+
    def intrin_type(self, vtype: PsVectorType):
        scalar_type = vtype.scalar_type
        match scalar_type:
@@ -96,9 +98,7 @@ class X86VectorArch(Enum):
                )

        if vtype.width > self.max_vector_width:
-            raise MaterializationError(
-                f"x86/{self} does not support {vtype}"
-            )
+            raise MaterializationError(f"x86/{self} does not support {vtype}")
        return PsCustomType(f"__m{vtype.width}{suffix}")


@@ -161,26 +161,124 @@ class X86VectorCpu(GenericVectorCpu):
        match expr:
            case PsUnOp() | PsBinOp():
                func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype())
-                return func(*operands)
+                intrinsic = func(*operands)
+                intrinsic.dtype = func.return_type
+                return intrinsic
            case _:
                raise MaterializationError(f"Cannot map {type(expr)} to x86 intrinsic")

+    def math_func_intrinsic(
+        self, expr: PsCall, operands: Sequence[PsExpression]
+    ) -> PsExpression:
+        assert isinstance(expr.function, PsMathFunction)
+
+        vtype = expr.get_dtype()
+        assert isinstance(vtype, PsVectorType)
+
+        prefix = self._vector_arch.intrin_prefix(vtype)
+        suffix = self._vector_arch.intrin_suffix(vtype)
+        rtype = atype = self._vector_arch.intrin_type(vtype)
+
+        match expr.function.func:
+            case (
+                MathFunctions.Exp
+                | MathFunctions.Log
+                | MathFunctions.Sin
+                | MathFunctions.Cos
+                | MathFunctions.Tan
+                | MathFunctions.Sinh
+                | MathFunctions.Cosh
+                | MathFunctions.ASin
+                | MathFunctions.ACos
+                | MathFunctions.ATan
+                | MathFunctions.ATan2
+                | MathFunctions.Pow
+            ):
+                raise MaterializationError(
+                    "Trigonometry, exp, log, and pow require SVML."
+                )
+
+            case MathFunctions.Floor | MathFunctions.Ceil if vtype.is_float():
+                opstr = expr.function.func.function_name
+                if vtype.width > 256:
+                    raise MaterializationError("512bit ceil/floor require SVML.")
+
+            case MathFunctions.Min | MathFunctions.Max:
+                opstr = expr.function.func.function_name
+                if (
+                    vtype.is_int()
+                    and vtype.scalar_type.width == 64
+                    and self._vector_arch < X86VectorArch.AVX512
+                ):
+                    raise MaterializationError(
+                        "64bit integer (signed and unsigned) min/max intrinsics require AVX512."
+                    )
+
+            case MathFunctions.Abs:
+                assert len(operands) == 1, "abs takes exactly one argument."
+                op = operands[0]
+
+                match vtype.scalar_type:
+                    case UInt():
+                        return op
+
+                    case SInt(width):
+                        opstr = expr.function.func.function_name
+                        if width == 64 and self._vector_arch < X86VectorArch.AVX512:
+                            raise MaterializationError(
+                                "64bit integer abs intrinsic requires AVX512."
+                            )
+
+                    case Fp():
+                        neg_zero = self.constant_intrinsic(PsConstant(-0.0, vtype))
+
+                        opstr = "andnot"
+                        func = CFunction(
+                            f"{prefix}_{opstr}_{suffix}", (atype,) * 2, rtype
+                        )
+
+                        return func(neg_zero, op)
+
+            case _:
+                raise MaterializationError(
+                    f"x86/{self} does not support {expr.function.func.function_name} on type {vtype}."
+                )
+
+        if expr.function.func in [
+            MathFunctions.ATan2,
+            MathFunctions.Min,
+            MathFunctions.Max,
+        ]:
+            num_args = 2
+        else:
+            num_args = 1
+
+        func = CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
+        return func(*operands)
+
    def vector_load(self, acc: PsVecMemAcc) -> PsExpression:
        if acc.stride is None:
-            load_func = _x86_packed_load(self._vector_arch, acc.dtype, False)
-            return load_func(
-                PsAddressOf(PsMemAcc(acc.pointer, acc.offset))
-            )
+            load_func, addr_type = _x86_packed_load(self._vector_arch, acc.dtype, False)
+            addr: PsExpression = PsAddressOf(PsMemAcc(acc.pointer, acc.offset))
+            if addr_type:
+                addr = PsCast(addr_type, addr)
+            intrinsic = load_func(addr)
+            intrinsic.dtype = load_func.return_type
+            return intrinsic
        else:
            raise NotImplementedError("Gather loads not implemented yet.")

    def vector_store(self, acc: PsVecMemAcc, arg: PsExpression) -> PsExpression:
        if acc.stride is None:
-            store_func = _x86_packed_store(self._vector_arch, acc.dtype, False)
-            return store_func(
-                PsAddressOf(PsMemAcc(acc.pointer, acc.offset)),
-                arg,
+            store_func, addr_type = _x86_packed_store(
+                self._vector_arch, acc.dtype, False
            )
+            addr: PsExpression = PsAddressOf(PsMemAcc(acc.pointer, acc.offset))
+            if addr_type:
+                addr = PsCast(addr_type, addr)
+            intrinsic = store_func(addr, arg)
+            intrinsic.dtype = store_func.return_type
+            return intrinsic
        else:
            raise NotImplementedError("Scatter stores not implemented yet.")

@@ -188,26 +286,46 @@ class X86VectorCpu(GenericVectorCpu):
 @cache
 def _x86_packed_load(
    varch: X86VectorArch, vtype: PsVectorType, aligned: bool
-) -> CFunction:
+) -> tuple[CFunction, PsPointerType | None]:
    prefix = varch.intrin_prefix(vtype)
-    suffix = varch.intrin_suffix(vtype)
    ptr_type = PsPointerType(vtype.scalar_type, const=True)
-    return CFunction(
-        f"{prefix}_load{'' if aligned else 'u'}_{suffix}", (ptr_type,), vtype
+
+    if isinstance(vtype.scalar_type, SInt):
+        suffix = f"si{vtype.width}"
+        addr_type = PsPointerType(varch.intrin_type(vtype))
+    else:
+        suffix = varch.intrin_suffix(vtype)
+        addr_type = None
+
+    return (
+        CFunction(
+            f"{prefix}_load{'' if aligned else 'u'}_{suffix}", (ptr_type,), vtype
+        ),
+        addr_type,
    )


 @cache
 def _x86_packed_store(
    varch: X86VectorArch, vtype: PsVectorType, aligned: bool
-) -> CFunction:
+) -> tuple[CFunction, PsPointerType | None]:
    prefix = varch.intrin_prefix(vtype)
-    suffix = varch.intrin_suffix(vtype)
    ptr_type = PsPointerType(vtype.scalar_type, const=True)
-    return CFunction(
-        f"{prefix}_store{'' if aligned else 'u'}_{suffix}",
-        (ptr_type, vtype),
-        PsCustomType("void"),
+
+    if isinstance(vtype.scalar_type, SInt):
+        suffix = f"si{vtype.width}"
+        addr_type = PsPointerType(varch.intrin_type(vtype))
+    else:
+        suffix = varch.intrin_suffix(vtype)
+        addr_type = None
+
+    return (
+        CFunction(
+            f"{prefix}_store{'' if aligned else 'u'}_{suffix}",
+            (ptr_type, vtype),
+            PsCustomType("void"),
+        ),
+        addr_type,
    )


@@ -223,7 +341,7 @@ def _x86_op_intrin(
        case PsVecBroadcast():
            opstr = "set1"
            if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4:
-                suffix += "x"   
+                suffix += "x"
            atype = vtype.scalar_type
        case PsAdd():
            opstr = "add"
@@ -239,8 +357,52 @@ def _x86_op_intrin(
            opstr = "mul"
        case PsDiv():
            opstr = "div"
+        case PsCast(target_type, arg):
+            atype = arg.dtype
+            widest_type = max(vtype, atype, key=lambda t: t.width)
+
+            assert target_type == vtype, "type mismatch"
+            assert isinstance(atype, PsVectorType)
+
+            def panic(detail: str = ""):
+                raise MaterializationError(
+                    f"Unable to select intrinsic for type conversion: "
+                    f"{varch.name} does not support packed conversion from {atype} to {target_type}. {detail}\n"
+                    f"    at: {op}"
+                )
+
+            if atype == vtype:
+                panic("Use `EliminateConstants` to eliminate trivial casts.")
+
+            match (atype.scalar_type, vtype.scalar_type):
+                # Not supported: cvtepi8_pX, cvtpX_epi8, cvtepi16_p[sd], cvtp[sd]_epi16
+                case (
+                    (SInt(8), Fp())
+                    | (Fp(), SInt(8))
+                    | (SInt(16), Fp(32))
+                    | (SInt(16), Fp(64))
+                    | (Fp(32), SInt(16))
+                    | (Fp(64), SInt(16))
+                ):
+                    panic()
+                # AVX512 only: cvtepi64_pX, cvtpX_epi64
+                case (SInt(64), Fp()) | (
+                    Fp(),
+                    SInt(64),
+                ) if varch < X86VectorArch.AVX512:
+                    panic()
+                # AVX512 only: cvtepiA_epiT if A > T
+                case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512:
+                    panic()
+
+                case _:
+                    prefix = varch.intrin_prefix(widest_type)
+                    opstr = f"cvt{varch.intrin_suffix(atype)}"
+
        case _:
-            raise MaterializationError(f"Unable to select operation intrinsic for {type(op)}")
+            raise MaterializationError(
+                f"Unable to select operation intrinsic for {type(op)}"
+            )

    num_args = 1 if isinstance(op, PsUnOp) else 2
    return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
--- a/src/pystencils/backend/transformations/ast_vectorizer.py
+++ b/src/pystencils/backend/transformations/ast_vectorizer.py
 from __future__ import annotations
-from typing import overload
+from textwrap import indent
+from typing import cast, overload

 from dataclasses import dataclass

@@ -11,7 +12,13 @@ from ..constants import PsConstant
 from ..functions import PsMathFunction

 from ..ast import PsAstNode
-from ..ast.structural import PsBlock, PsDeclaration, PsAssignment
+from ..ast.structural import (
+    PsBlock,
+    PsDeclaration,
+    PsAssignment,
+    PsLoop,
+    PsEmptyLeafMixIn,
+)
 from ..ast.expressions import (
    PsExpression,
    PsAddressOf,
@@ -24,6 +31,7 @@ from ..ast.expressions import (
    PsCall,
    PsMemAcc,
    PsBufferAcc,
+    PsSubscript,
    PsAdd,
    PsMul,
    PsSub,
@@ -148,6 +156,10 @@ class VectorizationContext:
            )
        return PsVectorType(scalar_type, self._lanes)

+    def axis_ctr_dependees(self, symbols: set[PsSymbol]) -> set[PsSymbol]:
+        """Returns all symbols in `symbols` that depend on the axis counter."""
+        return symbols & (self.vectorized_symbols.keys() | {self.axis.counter})
+

 @dataclass
 class Affine:
@@ -246,7 +258,7 @@ class AstVectorizer:

    def __call__(self, node: PsAstNode, vc: VectorizationContext) -> PsAstNode:
        """Perform subtree vectorization.
-        
+
        Args:
            node: Root of the subtree that should be vectorized
            vc: Object describing the current vectorization context
@@ -273,6 +285,14 @@ class AstVectorizer:
                return PsDeclaration(vec_lhs, vec_rhs)

            case PsAssignment(lhs, rhs):
+                if (
+                    isinstance(lhs, PsSymbolExpr)
+                    and lhs.symbol in vc.vectorized_symbols
+                ):
+                    return PsAssignment(
+                        self.visit_expr(lhs, vc), self.visit_expr(rhs, vc)
+                    )
+
                if not isinstance(lhs, (PsMemAcc, PsBufferAcc)):
                    raise VectorizationError(f"Unable to vectorize assignment to {lhs}")

@@ -286,6 +306,29 @@ class AstVectorizer:
                rhs_vec = self.visit_expr(rhs, vc)
                return PsAssignment(lhs_vec, rhs_vec)

+            case PsLoop(counter, start, stop, step, body):
+                # Check that loop bounds are lane-invariant
+                free_symbols = (
+                    self._collect_symbols(start)
+                    | self._collect_symbols(stop)
+                    | self._collect_symbols(step)
+                )
+                vec_dependencies = vc.axis_ctr_dependees(free_symbols)
+                if vec_dependencies:
+                    raise VectorizationError(
+                        "Unable to vectorize loop depending on vectorized symbols:\n"
+                        f"  Offending dependencies:\n"
+                        f"    {vec_dependencies}\n"
+                        f"  Found in loop:\n"
+                        f"{indent(str(node), '   ')}"
+                    )
+
+                vectorized_body = cast(PsBlock, self.visit(body, vc))
+                return PsLoop(counter, start, stop, step, vectorized_body)
+
+            case PsEmptyLeafMixIn():
+                return node
+
            case _:
                raise NotImplementedError(f"Vectorization of {node} is not implemented")

@@ -426,6 +469,23 @@ class AstVectorizer:
                    #   Buffer access is lane-invariant
                    vec_expr = PsVecBroadcast(vc.lanes, expr.clone())

+            case PsSubscript(array, index):
+                # Check that array expression and indices are lane-invariant
+                free_symbols = self._collect_symbols(array).union(
+                    *[self._collect_symbols(i) for i in index]
+                )
+                vec_dependencies = vc.axis_ctr_dependees(free_symbols)
+                if vec_dependencies:
+                    raise VectorizationError(
+                        "Unable to vectorize array subscript depending on vectorized symbols:\n"
+                        f"  Offending dependencies:\n"
+                        f"    {vec_dependencies}\n"
+                        f"  Found in expression:\n"
+                        f"{indent(str(expr), '   ')}"
+                    )
+
+                vec_expr = PsVecBroadcast(vc.lanes, expr.clone())
+
            case _:
                raise NotImplementedError(
                    f"Vectorization of {type(expr)} is not implemented"

--- a/src/pystencils/backend/transformations/eliminate_constants.py
+++ b/src/pystencils/backend/transformations/eliminate_constants.py
@@ -40,13 +40,7 @@ from ..ast.util import AstEqWrapper
 from ..constants import PsConstant
 from ..memory import PsSymbol
 from ..functions import PsMathFunction
-from ...types import (
-    PsNumericType,
-    PsBoolType,
-    PsScalarType,
-    PsVectorType,
-    constify
-)
+from ...types import PsNumericType, PsBoolType, PsScalarType, PsVectorType, constify


 __all__ = ["EliminateConstants"]
@@ -261,6 +255,9 @@ class EliminateConstants:
                assert isinstance(target_type, PsNumericType)
                return PsConstantExpr(c.reinterpret_as(target_type)), True

+            case PsCast(target_type, op) if target_type == op.get_dtype():
+                return op, all(subtree_constness)
+
            case PsVecBroadcast(lanes, PsConstantExpr(c)):
                scalar_type = c.get_dtype()
                assert isinstance(scalar_type, PsScalarType)
@@ -358,7 +355,10 @@ class EliminateConstants:
                                    from ...utils import c_intdiv

                                    folded = PsConstant(c_intdiv(v1, v2), dtype)
-                                elif isinstance(dtype, PsNumericType) and dtype.is_float():
+                                elif (
+                                    isinstance(dtype, PsNumericType)
+                                    and dtype.is_float()
+                                ):
                                    folded = PsConstant(v1 / v2, dtype)

                            if folded is not None:

--- a/src/pystencils/backend/transformations/select_intrinsics.py
+++ b/src/pystencils/backend/transformations/select_intrinsics.py
@@ -4,11 +4,12 @@ from typing import cast
 from ..kernelcreation import KernelCreationContext
 from ..memory import PsSymbol
 from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement
-from ..ast.expressions import PsExpression
-from ...types import PsVectorType, constify, deconstify
+from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral
+from ...types import PsCustomType, PsVectorType, constify, deconstify
 from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp
 from ..ast.vector import PsVecMemAcc
 from ..exceptions import MaterializationError
+from ..functions import CFunction, PsMathFunction

 from ..platforms import GenericVectorCpu

@@ -39,22 +40,34 @@ class SelectionContext:

 class SelectIntrinsics:
    """Lower IR vector types to intrinsic vector types, and IR vector operations to intrinsic vector operations.
-    
+
    This transformation will replace all vectorial IR elements by conforming implementations using
    compiler intrinsics for the given execution platform.

    Args:
        ctx: The current kernel creation context
        platform: Platform object representing the target hardware, which provides the intrinsics
+        use_builtin_convertvector: If `True`, type conversions between SIMD
+            vectors use the compiler builtin ``__builtin_convertvector``
+            instead of instrinsics. It is supported by Clang >= 3.7, GCC >= 9.1,
+            and ICX. Not supported by ICC or MSVC. Activate if you need type
+            conversions not natively supported by your CPU, e.g. conversion from
+            64bit integer to double on an x86 AVX machine. Defaults to `False`.

    Raises:
        MaterializationError: If a vector type or operation cannot be represented by intrinsics
            on the given platform
    """

-    def __init__(self, ctx: KernelCreationContext, platform: GenericVectorCpu):
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        platform: GenericVectorCpu,
+        use_builtin_convertvector: bool = False,
+    ):
        self._ctx = ctx
        self._platform = platform
+        self._use_builtin_convertvector = use_builtin_convertvector

    def __call__(self, node: PsAstNode) -> PsAstNode:
        return self.visit(node, SelectionContext(self._ctx, self._platform))
@@ -68,11 +81,11 @@ class SelectIntrinsics:
                lhs_new = cast(PsSymbolExpr, self.visit_expr(lhs, sc))
                rhs_new = self.visit_expr(rhs, sc)
                return PsDeclaration(lhs_new, rhs_new)
-            
-            case PsAssignment(lhs, rhs) if isinstance(lhs, PsVecMemAcc): 
+
+            case PsAssignment(lhs, rhs) if isinstance(lhs, PsVecMemAcc):
                new_rhs = self.visit_expr(rhs, sc)
                return PsStatement(self._platform.vector_store(lhs, new_rhs))
-            
+
            case _:
                node.children = [self.visit(c, sc) for c in node.children]

@@ -89,6 +102,22 @@ class SelectIntrinsics:
            case PsConstantExpr(c):
                return self._platform.constant_intrinsic(c)

+            case PsCast(target_type, operand) if self._use_builtin_convertvector:
+                assert isinstance(target_type, PsVectorType)
+                op = self.visit_expr(operand, sc)
+
+                rtype = PsCustomType(
+                    f"{target_type.scalar_type.c_string()} __attribute__((__vector_size__({target_type.itemsize})))"
+                )
+                target_type_literal = PsExpression.make(PsLiteral(rtype.name, rtype))
+
+                func = CFunction(
+                    "__builtin_convertvector", (op.get_dtype(), rtype), target_type
+                )
+                intrinsic = func(op, target_type_literal)
+                intrinsic.dtype = func.return_type
+                return intrinsic
+
            case PsUnOp(operand):
                op = self.visit_expr(operand, sc)
                return self._platform.op_intrinsic(expr, [op])
@@ -102,6 +131,10 @@ class SelectIntrinsics:
            case PsVecMemAcc():
                return self._platform.vector_load(expr)

+            case PsCall(function, args) if isinstance(function, PsMathFunction):
+                arguments = [self.visit_expr(a, sc) for a in args]
+                return self._platform.math_func_intrinsic(expr, arguments)
+
            case _:
                raise MaterializationError(
                    f"Unable to select intrinsic implementation for {expr}"

--- a/src/pystencils/codegen/target.py
+++ b/src/pystencils/codegen/target.py
@@ -115,6 +115,18 @@ class Target(Flag):
            return avail_targets.pop()
        else:
            return Target.GenericCPU
+        
+    @staticmethod
+    def available_targets() -> list[Target]:
+        targets = [Target.GenericCPU]
+        try:
+            import cupy  # noqa: F401
+            targets.append(Target.CUDA)
+        except ImportError:
+            pass
+
+        targets += Target.available_vector_cpu_targets()
+        return targets

    @staticmethod
    def available_vector_cpu_targets() -> list[Target]:

--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -31,11 +31,13 @@ except ImportError:
 AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets()
 TARGET_IDS = [t.name for t in AVAILABLE_TARGETS]

+
 @pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS)
 def target(request) -> ps.Target:
    """Provides all code generation targets available on the current hardware"""
    return request.param

+
 @pytest.fixture
 def gen_config(target: ps.Target):
    """Default codegen configuration for the current target.
@@ -56,6 +58,7 @@ def gen_config(target: ps.Target):

    return gen_config

+
 @pytest.fixture()
 def xp(target: ps.Target) -> ModuleType:
    """Primary array module for the current target.

--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
 import sympy as sp
 import numpy as np
 import pytest
-
-from pystencils import fields, create_kernel, CreateKernelConfig, Target, Assignment, Field
+from dataclasses import replace
+from itertools import product
+
+from pystencils import (
+    fields,
+    create_kernel,
+    CreateKernelConfig,
+    Target,
+    Assignment,
+    Field,
+)
 from pystencils.backend.ast import dfs_preorder
 from pystencils.backend.ast.expressions import PsCall

@@ -34,38 +43,42 @@ def binary_function(name, xp):
    }[name]


-@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
+AVAIL_TARGETS = Target.available_targets()
+
+
 @pytest.mark.parametrize(
-    "function_name",
-    (
-        "exp",
-        "log",
-        "sin",
-        "cos",
-        "tan",
-        "sinh",
-        "cosh",
-        "asin",
-        "acos",
-        "atan",
-        "abs",
-        "floor",
-        "ceil",
-    ),
+    "function_name, target",
+    list(
+        product(
+            (
+                "exp",
+                "log",
+                "sin",
+                "cos",
+                "tan",
+                "sinh",
+                "cosh",
+                "asin",
+                "acos",
+                "atan",
+            ),
+            [t for t in AVAIL_TARGETS if Target._X86 not in t],
+        )
+    )
+    + list(
+        product(
+            ["floor", "ceil"], [t for t in AVAIL_TARGETS if Target._AVX512 not in t]
+        )
+    )
+    + list(product(["abs"], AVAIL_TARGETS)),
 )
 @pytest.mark.parametrize("dtype", (np.float32, np.float64))
-def test_unary_functions(target, function_name, dtype):
-    if target == Target.CUDA:
-        xp = pytest.importorskip("cupy")
-    else:
-        xp = np
-
+def test_unary_functions(gen_config, xp, function_name, dtype):
    sp_func, xp_func = unary_function(function_name, xp)
    resolution = np.finfo(dtype).resolution

-    inp = xp.array(
-        [[0.1, 0.2, 0.3], [-0.8, -1.6, -12.592], [xp.pi, xp.e, 0.0]], dtype=dtype
-    )
+    #   Array size should be larger than eight, such that vectorized kernels don't just run their remainder loop
+    inp = xp.array([0.1, 0.2, 0.0, -0.8, -1.6, -12.592, xp.pi, xp.e, -0.3], dtype=dtype)
    outp = xp.zeros_like(inp)

    reference = xp_func(inp)
@@ -74,7 +87,7 @@ def test_unary_functions(target, function_name, dtype):
    outp_field = inp_field.new_field_with_different_name("outp")

    asms = [Assignment(outp_field.center(), sp_func(inp_field.center()))]
-    gen_config = CreateKernelConfig(target=target, default_dtype=dtype)
+    gen_config = replace(gen_config, default_dtype=dtype)

    kernel = create_kernel(asms, gen_config)
    kfunc = kernel.compile()
@@ -83,28 +96,26 @@ def test_unary_functions(target, function_name, dtype):
    xp.testing.assert_allclose(outp, reference, rtol=resolution)


-@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
-@pytest.mark.parametrize("function_name", ("min", "max", "pow", "atan2"))
+@pytest.mark.parametrize(
+    "function_name,target",
+    list(product(["min", "max"], AVAIL_TARGETS))
+    + list(
+        product(["pow", "atan2"], [t for t in AVAIL_TARGETS if Target._X86 not in t])
+    ),
+)
 @pytest.mark.parametrize("dtype", (np.float32, np.float64))
-def test_binary_functions(target, function_name, dtype):
-    if target == Target.CUDA:
-        xp = pytest.importorskip("cupy")
-    else:
-        xp = np
-
-    sp_func, np_func = binary_function(function_name, xp)
+def test_binary_functions(gen_config, xp, function_name, dtype):
+    sp_func, xp_func = binary_function(function_name, xp)
    resolution: dtype = np.finfo(dtype).resolution

-    inp = xp.array(
-        [[0.1, 0.2, 0.3], [-0.8, -1.6, -12.592], [xp.pi, xp.e, 0.0]], dtype=dtype
-    )
+    inp = xp.array([0.1, 0.2, 0.3, -0.8, -1.6, -12.592, xp.pi, xp.e, 0.0], dtype=dtype)
    inp2 = xp.array(
-        [[3.1, -0.5, 21.409], [11.0, 1.0, -14e3], [2.0 * xp.pi, -xp.e, 0.0]],
+        [3.1, -0.5, 21.409, 11.0, 1.0, -14e3, 2.0 * xp.pi, -xp.e, 0.0],
        dtype=dtype,
    )
    outp = xp.zeros_like(inp)

-    reference = np_func(inp, inp2)
+    reference = xp_func(inp, inp2)

    inp_field = Field.create_from_numpy_array("inp", inp)
    inp2_field = Field.create_from_numpy_array("inp2", inp)
@@ -115,7 +126,7 @@ def test_binary_functions(target, function_name, dtype):
            outp_field.center(), sp_func(inp_field.center(), inp2_field.center())
        )
    ]
-    gen_config = CreateKernelConfig(target=target, default_dtype=dtype)
+    gen_config = replace(gen_config, default_dtype=dtype)

    kernel = create_kernel(asms, gen_config)
    kfunc = kernel.compile()
@@ -124,26 +135,107 @@ def test_binary_functions(target, function_name, dtype):
    xp.testing.assert_allclose(outp, reference, rtol=resolution)


-@pytest.mark.parametrize('a', [sp.Symbol('a'), fields('a:  float64[2d]').center])
+dtype_and_target_for_integer_funcs = pytest.mark.parametrize(
+    "dtype, target",
+    list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA]))
+    + list(
+        product(
+            [np.int64],
+            [
+                t
+                for t in AVAIL_TARGETS
+                if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA)
+            ],
+        )
+    ),
+)
+
+
+@dtype_and_target_for_integer_funcs
+def test_integer_abs(gen_config, xp, dtype):
+    sp_func, xp_func = unary_function("abs", xp)
+
+    smallest = np.iinfo(dtype).min
+    largest = np.iinfo(dtype).max
+
+    inp = xp.array([-1, 0, 1, 3, -5, -312, smallest + 1, largest], dtype=dtype)
+
+    outp = xp.zeros_like(inp)
+    reference = xp_func(inp)
+
+    inp_field = Field.create_from_numpy_array("inp", inp)
+    outp_field = inp_field.new_field_with_different_name("outp")
+
+    asms = [Assignment(outp_field.center(), sp_func(inp_field.center()))]
+    gen_config = replace(gen_config, default_dtype=dtype)
+
+    kernel = create_kernel(asms, gen_config)
+    kfunc = kernel.compile()
+    kfunc(inp=inp, outp=outp)
+
+    xp.testing.assert_array_equal(outp, reference)
+
+
+@pytest.mark.parametrize("function_name", ("min", "max"))
+@dtype_and_target_for_integer_funcs
+def test_integer_binary_functions(gen_config, xp, function_name, dtype):
+    sp_func, xp_func = binary_function(function_name, xp)
+
+    smallest = np.iinfo(dtype).min
+    largest = np.iinfo(dtype).max
+
+    inp1 = xp.array([-1, 0, 1, 3, -5, -312, smallest + 1, largest], dtype=dtype)
+    inp2 = xp.array([3, -5, 1, 12, 1, 11, smallest + 42, largest - 3], dtype=dtype)
+
+    outp = xp.zeros_like(inp1)
+    reference = xp_func(inp1, inp2)
+
+    inp_field = Field.create_from_numpy_array("inp1", inp1)
+    inp2_field = Field.create_from_numpy_array("inp2", inp2)
+    outp_field = inp_field.new_field_with_different_name("outp")
+
+    asms = [
+        Assignment(
+            outp_field.center(), sp_func(inp_field.center(), inp2_field.center())
+        )
+    ]
+    gen_config = replace(gen_config, default_dtype=dtype)
+
+    kernel = create_kernel(asms, gen_config)
+    kfunc = kernel.compile()
+    kfunc(inp1=inp1, inp2=inp2, outp=outp)
+
+    xp.testing.assert_array_equal(outp, reference)
+
+
+@pytest.mark.parametrize("a", [sp.Symbol("a"), fields("a:  float64[2d]").center])
 def test_avoid_pow(a):
-    x = fields('x:  float64[2d]')
+    x = fields("x:  float64[2d]")

-    up = Assignment(x.center_vector[0], 2 * a ** 2 / 3)
+    up = Assignment(x.center_vector[0], 2 * a**2 / 3)
    func = create_kernel(up)

-    powers = list(dfs_preorder(func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name))
+    powers = list(
+        dfs_preorder(
+            func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name
+        )
+    )
    assert not powers


 @pytest.mark.xfail(reason="fast_div not available yet")
 def test_avoid_pow_fast_div():
-    x = fields('x:  float64[2d]')
-    a = fields('a:  float64[2d]').center
+    x = fields("x:  float64[2d]")
+    a = fields("a:  float64[2d]").center

    up = Assignment(x.center_vector[0], fast_division(1, (a**2)))
    func = create_kernel(up, config=CreateKernelConfig(target=Target.GPU))

-    powers = list(dfs_preorder(func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name))
+    powers = list(
+        dfs_preorder(
+            func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name
+        )
+    )
    assert not powers


@@ -151,14 +243,23 @@ def test_avoid_pow_move_constants():
    # At the end of the kernel creation the function move_constants_before_loop will be called
    # This function additionally contains substitutions for symbols with the same value
    # Thus it simplifies the equations again
-    x = fields('x:  float64[2d]')
+    x = fields("x:  float64[2d]")
    a, b, c = sp.symbols("a, b, c")

-    up = [Assignment(a, 0.0),
-          Assignment(b, 0.0),
-          Assignment(c, 0.0),
-          Assignment(x.center_vector[0], a**2/18 - a*b/6 - a/18 + b**2/18 + b/18 - c**2/36)]
+    up = [
+        Assignment(a, 0.0),
+        Assignment(b, 0.0),
+        Assignment(c, 0.0),
+        Assignment(
+            x.center_vector[0],
+            a**2 / 18 - a * b / 6 - a / 18 + b**2 / 18 + b / 18 - c**2 / 36,
+        ),
+    ]
    func = create_kernel(up)

-    powers = list(dfs_preorder(func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name))
+    powers = list(
+        dfs_preorder(
+            func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name
+        )
+    )
    assert not powers
--- a/tests/kernelcreation/test_type_cast.py
+++ b/tests/kernelcreation/test_type_cast.py
+import numpy as np
+import pytest
+from itertools import product
+
+from pystencils import (
+    create_kernel,
+    Target,
+    Assignment,
+    Field,
+)
+from pystencils.sympyextensions.typed_sympy import CastFunc
+
+
+AVAIL_TARGETS_NO_SSE = [t for t in Target.available_targets() if Target._SSE not in t]
+
+
+target_and_dtype = pytest.mark.parametrize(
+    "target, from_type, to_type",
+    list(
+        product(
+            [
+                t
+                for t in AVAIL_TARGETS_NO_SSE
+                if Target._X86 in t and Target._AVX512 not in t
+            ],
+            [np.int32, np.float32, np.float64],
+            [np.int32, np.float32, np.float64],
+        )
+    )
+    + list(
+        product(
+            [
+                t
+                for t in AVAIL_TARGETS_NO_SSE
+                if Target._X86 not in t or Target._AVX512 in t
+            ],
+            [np.int32, np.int64, np.float32, np.float64],
+            [np.int32, np.int64, np.float32, np.float64],
+        )
+    ),
+)
+
+
+@target_and_dtype
+def test_type_cast(gen_config, xp, from_type, to_type):
+    if np.issubdtype(from_type, np.floating):
+        inp = xp.array([-1.25, -0, 1.5, 3, -5, -312, 42, 6.625, -9], dtype=from_type)
+    else:
+        inp = xp.array([-1, 0, 1, 3, -5, -312, 42, 6, -9], dtype=from_type)
+
+    outp = xp.zeros_like(inp).astype(to_type)
+    truncated = inp.astype(to_type)
+    rounded = xp.round(inp).astype(to_type)
+
+    inp_field = Field.create_from_numpy_array("inp", inp)
+    outp_field = Field.create_from_numpy_array("outp", outp)
+
+    asms = [Assignment(outp_field.center(), CastFunc(inp_field.center(), to_type))]
+
+    kernel = create_kernel(asms, gen_config)
+    kfunc = kernel.compile()
+    kfunc(inp=inp, outp=outp)
+
+    if np.issubdtype(from_type, np.floating) and not np.issubdtype(
+        to_type, np.floating
+    ):
+        # rounding mode depends on platform
+        try:
+            xp.testing.assert_array_equal(outp, truncated)
+        except AssertionError:
+            xp.testing.assert_array_equal(outp, rounded)
+    else:
+        xp.testing.assert_array_equal(outp, truncated)
--- a/tests/nbackend/transformations/test_ast_vectorizer.py
+++ b/tests/nbackend/transformations/test_ast_vectorizer.py
 import sympy as sp
 import pytest

-from pystencils import Assignment, TypedSymbol, fields, FieldType
+from pystencils import Assignment, TypedSymbol, fields, FieldType, make_slice
 from pystencils.sympyextensions import CastFunc, mem_acc
 from pystencils.sympyextensions.pointers import AddressOf

@@ -18,19 +18,25 @@ from pystencils.backend.transformations import (
    AstVectorizer,
 )
 from pystencils.backend.ast import dfs_preorder
-from pystencils.backend.ast.structural import PsBlock, PsDeclaration, PsAssignment
+from pystencils.backend.ast.structural import (
+    PsBlock,
+    PsDeclaration,
+    PsAssignment,
+    PsLoop,
+)
 from pystencils.backend.ast.expressions import (
    PsSymbolExpr,
    PsConstantExpr,
    PsExpression,
    PsCast,
    PsMemAcc,
-    PsCall
+    PsCall,
+    PsSubscript,
 )
 from pystencils.backend.functions import CFunction
 from pystencils.backend.ast.vector import PsVecBroadcast, PsVecMemAcc
 from pystencils.backend.exceptions import VectorizationError
-from pystencils.types import PsVectorType, deconstify, create_type
+from pystencils.types import PsArrayType, PsVectorType, deconstify, create_type


 def test_vectorize_expressions():
@@ -56,7 +62,9 @@ def test_vectorize_expressions():
        factory.parse_sympy(-x * y + 13 * z - 4 * (x / w) * (x + z)),
        factory.parse_sympy(sp.sin(x + z) - sp.cos(w)),
        factory.parse_sympy(y**2 - x**2),
-        typify(- factory.parse_sympy(x / (w**2))),  # place the negation outside, since SymPy would remove it
+        typify(
+            -factory.parse_sympy(x / (w**2))
+        ),  # place the negation outside, since SymPy would remove it
        factory.parse_sympy(13 + (1 / w) - sp.exp(x) * 24),
    ]:
        vec_expr = vectorize.visit(expr, vc)
@@ -239,7 +247,31 @@ def test_reject_symbol_assignments():
    with pytest.raises(VectorizationError):
        _ = vectorize.visit(asm, vc)

-    
+
+def test_vectorize_assignments():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+
+    x, y = sp.symbols("x, y")
+
+    vectorize = AstVectorizer(ctx)
+
+    axis = VectorizationAxis(
+        ctx.get_symbol("ctr", ctx.index_dtype),
+    )
+    vc = VectorizationContext(ctx, 4, axis)
+
+    decl = PsDeclaration(factory.parse_sympy(x), factory.parse_sympy(sp.sympify(0)))
+    asm = PsAssignment(factory.parse_sympy(x), factory.parse_sympy(3 + y))
+    ast = PsBlock([decl, asm])
+
+    vec_ast = vectorize.visit(ast, vc)
+    vec_asm = vec_ast.statements[1]
+
+    assert isinstance(vec_asm, PsAssignment)
+    assert isinstance(vec_asm.lhs.symbol.dtype, PsVectorType)
+
+
 def test_vectorize_memory_assignments():
    ctx = KernelCreationContext()
    factory = AstFactory(ctx)
@@ -260,7 +292,7 @@ def test_vectorize_memory_assignments():
    asm = typify(
        PsAssignment(
            factory.parse_sympy(mem_acc(ptr, 3 * ctr + 2)),
-            factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3))
+            factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3)),
        )
    )

@@ -303,7 +335,7 @@ def test_invalid_memory_assignments():
    asm = typify(
        PsAssignment(
            factory.parse_sympy(mem_acc(ptr, 3 * i + 2)),
-            factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3))
+            factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3)),
        )
    )

@@ -376,7 +408,9 @@ def test_vectorize_mem_acc():
    assert vec_acc.vector_entries == 4

    #   Even more complex affine
-    idx = - factory.parse_index(ctr) / factory.parse_index(i) - factory.parse_index(ctr) * factory.parse_index(j)
+    idx = -factory.parse_index(ctr) / factory.parse_index(i) - factory.parse_index(
+        ctr
+    ) * factory.parse_index(j)
    acc = typify(PsMemAcc(factory.parse_sympy(ptr), idx))
    assert isinstance(acc, PsMemAcc)

@@ -386,11 +420,15 @@ def test_vectorize_mem_acc():
    assert vec_acc.pointer.structurally_equal(acc.pointer)
    assert vec_acc.offset is not acc.offset
    assert vec_acc.offset.structurally_equal(acc.offset)
-    assert vec_acc.stride.structurally_equal(factory.parse_index(-1) / factory.parse_index(i) - factory.parse_index(j))
+    assert vec_acc.stride.structurally_equal(
+        factory.parse_index(-1) / factory.parse_index(i) - factory.parse_index(j)
+    )
    assert vec_acc.vector_entries == 4

    #   Mixture of strides in affine and axis
-    vc = VectorizationContext(ctx, 4, VectorizationAxis(ctx.get_symbol("ctr"), step=factory.parse_index(3)))
+    vc = VectorizationContext(
+        ctx, 4, VectorizationAxis(ctx.get_symbol("ctr"), step=factory.parse_index(3))
+    )

    acc = factory.parse_sympy(mem_acc(ptr, 3 * i + 5 * ctr))
    assert isinstance(acc, PsMemAcc)
@@ -421,7 +459,9 @@ def test_invalid_mem_acc():
    ptr = TypedSymbol("ptr", create_type("float64 *"))

    #   Non-symbol pointer
-    acc = factory.parse_sympy(mem_acc(AddressOf(mem_acc(ptr, 10)), 3 * i + ctr * (3 + ctr)))
+    acc = factory.parse_sympy(
+        mem_acc(AddressOf(mem_acc(ptr, 10)), 3 * i + ctr * (3 + ctr))
+    )

    with pytest.raises(VectorizationError):
        _ = vectorize.visit(acc, vc)
@@ -503,3 +543,107 @@ def test_invalid_buffer_acc():

    with pytest.raises(VectorizationError):
        _ = vectorize.visit(acc, vc)
+
+
+def test_vectorize_subscript():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+    vectorize = AstVectorizer(ctx)
+
+    ctr = ctx.get_symbol("ctr", ctx.index_dtype)
+
+    axis = VectorizationAxis(ctr)
+    vc = VectorizationContext(ctx, 4, axis)
+
+    acc = PsSubscript(
+        PsExpression.make(ctx.get_symbol("arr", PsArrayType(ctx.default_dtype, 42))),
+        [PsExpression.make(ctx.get_symbol("i", ctx.index_dtype))],
+    )  # independent of vectorization axis
+
+    vec_acc = vectorize.visit(factory._typify(acc), vc)
+    assert isinstance(vec_acc, PsVecBroadcast)
+    assert isinstance(vec_acc.operand, PsSubscript)
+
+
+def test_invalid_subscript():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+    vectorize = AstVectorizer(ctx)
+
+    ctr = ctx.get_symbol("ctr", ctx.index_dtype)
+
+    axis = VectorizationAxis(ctr)
+    vc = VectorizationContext(ctx, 4, axis)
+
+    acc = PsSubscript(
+        PsExpression.make(ctx.get_symbol("arr", PsArrayType(ctx.default_dtype, 42))),
+        [PsExpression.make(ctr)],  # depends on vectorization axis
+    )
+
+    with pytest.raises(VectorizationError):
+        _ = vectorize.visit(factory._typify(acc), vc)
+
+
+def test_vectorize_nested_loop():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+    vectorize = AstVectorizer(ctx)
+
+    ctr = ctx.get_symbol("i", ctx.index_dtype)
+
+    axis = VectorizationAxis(ctr)
+    vc = VectorizationContext(ctx, 4, axis)
+
+    ast = factory.loop_nest(
+        ("i", "j"),
+        make_slice[:8, :8],  # inner loop does not depend on vectorization axis
+        PsBlock(
+            [
+                PsDeclaration(
+                    PsExpression.make(ctx.get_symbol("x", ctx.default_dtype)),
+                    PsExpression.make(PsConstant(42, ctx.default_dtype)),
+                )
+            ]
+        ),
+    )
+
+    vec_ast = vectorize.visit(ast, vc)
+    inner_loop = next(
+        dfs_preorder(
+            vec_ast,
+            lambda node: isinstance(node, PsLoop) and node.counter.symbol.name == "j",
+        )
+    )
+    decl = inner_loop.body.statements[0]
+
+    assert inner_loop.step.structurally_equal(
+        PsExpression.make(PsConstant(1, ctx.index_dtype))
+    )
+    assert isinstance(decl.lhs.symbol.dtype, PsVectorType)
+
+
+def test_invalid_nested_loop():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+    vectorize = AstVectorizer(ctx)
+
+    ctr = ctx.get_symbol("i", ctx.index_dtype)
+
+    axis = VectorizationAxis(ctr)
+    vc = VectorizationContext(ctx, 4, axis)
+
+    ast = factory.loop_nest(
+        ("i", "j"),
+        make_slice[:8, :ctr],  # inner loop depends on vectorization axis
+        PsBlock(
+            [
+                PsDeclaration(
+                    PsExpression.make(ctx.get_symbol("x", ctx.default_dtype)),
+                    PsExpression.make(PsConstant(42, ctx.default_dtype)),
+                )
+            ]
+        ),
+    )
+
+    with pytest.raises(VectorizationError):
+        _ = vectorize.visit(ast, vc)
No results found