Daniel Bauer · Daniel Bauer
--- a/src/pystencils/backend/platforms/x86.py

+ 40

− 4
+++ b/src/pystencils/backend/platforms/x86.py

+ 40

− 4
 @@ -16,6 +16,7 @@ from ..ast.expressions import (
    PsConstantExpr,
    PsCast,
    PsCall,
+    PsLiteral,
 )
 from ..ast.vector import PsVecMemAcc, PsVecBroadcast
 from ...types import PsCustomType, PsVectorType, PsPointerType
 @@ -109,9 +110,28 @@ class X86VectorCpu(GenericVectorCpu):
    https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html.
    """

-    def __init__(self, ctx: KernelCreationContext, vector_arch: X86VectorArch):
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        vector_arch: X86VectorArch,
+        use_builtin_convertvector: bool = False,
+    ):
+        """Initialize an `X86VectorCpu`.
+
+        Args:
+            ctx: The kernel creation context
+            vector_arch: The architecture of the target machine
+            use_builtin_convertvector: If `True`, type conversions between
+                SIMD vectors use the compiler builtin `__builtin_convertvector`
+                instead of intrinsics. It is supported by Clang >= 3.7, GCC
+                >= 9.1, and ICX. Not supported by ICC or MSVC. Activate if you
+                need type conversions not natively supported by your CPU, e.g.
+                conversion from 64-bit integer to double on an AVX machine.
+                Defaults to `False`.
+        """
        super().__init__(ctx)
        self._vector_arch = vector_arch
+        self._use_builtin_convertvector = use_builtin_convertvector

    @property
    def vector_arch(self) -> X86VectorArch:
 @@ -159,12 +179,28 @@ class X86VectorCpu(GenericVectorCpu):
        self, expr: PsExpression, operands: Sequence[PsExpression]
    ) -> PsExpression:
        match expr:
+            case PsCast() if self._use_builtin_convertvector:
+                vtype = expr.get_dtype()
+                assert isinstance(vtype, PsVectorType)
+                assert len(operands) == 1
+                op = operands[0]
+
+                rtype = self._vector_arch.intrin_type(vtype)
+                target_type_literal = PsExpression.make(PsLiteral(rtype.name, rtype))
+
+                func = CFunction(
+                    "__builtin_convertvector", (op.get_dtype(), rtype), vtype
+                )
+                intrinsic = func(op, target_type_literal)
            case PsUnOp() | PsBinOp():
                func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype())
-                return func(*operands)
+                intrinsic = func(*operands)
            case _:
                raise MaterializationError(f"Cannot map {type(expr)} to x86 intrinsic")

+        intrinsic.dtype = expr.dtype
+        return intrinsic
+
    def math_func_intrinsic(
        self, expr: PsCall, operands: Sequence[PsExpression]
    ) -> PsExpression:
 @@ -332,7 +368,8 @@ def _x86_op_intrin(
                raise MaterializationError(
                    f"Unable to select intrinsic for type conversion: "
                    f"{varch.name} does not support packed conversion from {atype} to {target_type}.\n"
-                    f"    at: {op}"
+                    f"    at: {op}\n"
+                    "Note: You may consider initializing the `X86VectorCpu` with `use_builtin_convertvector=True`."
                )

            match (atype.scalar_type, vtype.scalar_type):
 @@ -351,7 +388,6 @@ def _x86_op_intrin(
                    Fp(),
                    SInt(64),
                ) if varch < X86VectorArch.AVX512:
-                    # TODO: https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                    panic()
                # AVX512 only: cvtepiA_epiT if A > T
                case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512: