Skip to content
Snippets Groups Projects

Various extensions to the vectorizer

Merged Daniel Bauer requested to merge hyteg/pystencils:bauerd/vec-extensions into v2.0-dev
1 file
+ 40
4
Compare changes
  • Side-by-side
  • Inline
@@ -16,6 +16,7 @@ from ..ast.expressions import (
PsConstantExpr,
PsCast,
PsCall,
PsLiteral,
)
from ..ast.vector import PsVecMemAcc, PsVecBroadcast
from ...types import PsCustomType, PsVectorType, PsPointerType
@@ -109,9 +110,28 @@ class X86VectorCpu(GenericVectorCpu):
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html.
"""
def __init__(self, ctx: KernelCreationContext, vector_arch: X86VectorArch):
def __init__(
self,
ctx: KernelCreationContext,
vector_arch: X86VectorArch,
use_builtin_convertvector: bool = False,
):
"""Initialize an `X86VectorCpu`.
Args:
ctx: The kernel creation context
vector_arch: The architecture of the target machine
use_builtin_convertvector: If `True`, type conversions between
SIMD vectors use the compiler builtin `__builtin_convertvector`
instead of intrinsics. It is supported by Clang >= 3.7, GCC
>= 9.1, and ICX. Not supported by ICC or MSVC. Activate if you
need type conversions not natively supported by your CPU, e.g.
conversion from 64bit integer to double on an AVX machine.
Defaults to `False`.
"""
super().__init__(ctx)
self._vector_arch = vector_arch
self._use_builtin_convertvector = use_builtin_convertvector
@property
def vector_arch(self) -> X86VectorArch:
@@ -159,12 +179,28 @@ class X86VectorCpu(GenericVectorCpu):
self, expr: PsExpression, operands: Sequence[PsExpression]
) -> PsExpression:
match expr:
case PsCast() if self._use_builtin_convertvector:
vtype = expr.get_dtype()
assert isinstance(vtype, PsVectorType)
assert len(operands) == 1
op = operands[0]
rtype = self._vector_arch.intrin_type(vtype)
target_type_literal = PsExpression.make(PsLiteral(rtype.name, rtype))
func = CFunction(
"__builtin_convertvector", (op.get_dtype(), rtype), vtype
)
intrinsic = func(op, target_type_literal)
case PsUnOp() | PsBinOp():
func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype())
return func(*operands)
intrinsic = func(*operands)
case _:
raise MaterializationError(f"Cannot map {type(expr)} to x86 intrinsic")
intrinsic.dtype = expr.dtype
return intrinsic
def math_func_intrinsic(
self, expr: PsCall, operands: Sequence[PsExpression]
) -> PsExpression:
@@ -332,7 +368,8 @@ def _x86_op_intrin(
raise MaterializationError(
f"Unable to select intrinsic for type conversion: "
f"{varch.name} does not support packed conversion from {atype} to {target_type}.\n"
f" at: {op}"
f" at: {op}\n"
"Note: You may consider initializing the `X86VectorCpu` with `use_builtin_convertvector=True`."
)
match (atype.scalar_type, vtype.scalar_type):
@@ -351,7 +388,6 @@ def _x86_op_intrin(
Fp(),
SInt(64),
) if varch < X86VectorArch.AVX512:
# TODO: https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
panic()
# AVX512 only: cvtepiA_epiT if A > T
case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512:
Loading