diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 94fbfa0e1dce6aaf747134b21d946eac4dc023a6..b6d7dd551bb70b799241cb0ca6257fd48b57a3d6 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -57,7 +57,7 @@ class GenericCpu(Platform): def select_function(self, call: PsCall) -> PsExpression: assert isinstance(call.function, PsMathFunction) - + func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args @@ -87,13 +87,13 @@ class GenericCpu(Platform): call.function = cfunc return call - + if isinstance(dtype, PsIntegerType): match func: case MathFunctions.Abs: zero = PsExpression.make(PsConstant(0, dtype)) arg = call.args[0] - return PsTernary(PsGe(arg, zero), arg, - arg) + return PsTernary(PsGe(arg, zero), arg, -arg) case MathFunctions.Min: arg1, arg2 = call.args return PsTernary(PsLe(arg1, arg2), arg1, arg2) @@ -131,7 +131,10 @@ class GenericCpu(Platform): PsLookup( PsBufferAcc( ispace.index_list.base_pointer, - (PsExpression.make(ispace.sparse_counter), factory.parse_index(0)), + ( + PsExpression.make(ispace.sparse_counter), + factory.parse_index(0), + ), ), coord.name, ), @@ -158,26 +161,33 @@ class GenericVectorCpu(GenericCpu, ABC): @abstractmethod def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: """Return the intrinsic vector type for the given generic vector type, - or raise an `MaterializationError` if type is not supported.""" + or raise a `MaterializationError` if type is not supported.""" @abstractmethod def constant_intrinsic(self, c: PsConstant) -> PsExpression: """Return an expression that initializes a constant vector, - or raise an `MaterializationError` if not supported.""" + or raise a `MaterializationError` if not supported.""" @abstractmethod def op_intrinsic( self, expr: PsExpression, operands: Sequence[PsExpression] ) -> PsExpression: """Return an expression intrinsically invoking the given operation - or raise an `MaterializationError` if not supported.""" + or raise a `MaterializationError` if not supported.""" + + @abstractmethod + def math_func_intrinsic( + self, expr: PsCall, operands: Sequence[PsExpression] + ) -> PsExpression: + """Return an expression intrinsically invoking the given mathematical + function or raise a `MaterializationError` if not supported.""" @abstractmethod def vector_load(self, acc: PsVecMemAcc) -> PsExpression: """Return an expression intrinsically performing a vector load, - or raise an `MaterializationError` if not supported.""" + or raise a `MaterializationError` if not supported.""" @abstractmethod def vector_store(self, acc: PsVecMemAcc, arg: PsExpression) -> PsExpression: """Return an expression intrinsically performing a vector store, - or raise an `MaterializationError` if not supported.""" + or raise a `MaterializationError` if not supported.""" diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 3e74e89282b44011295c742d96b35ce63401bc67..7d2fe650fc23a54ea301817d57a3816b6780bd85 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -13,7 +13,9 @@ from ..ast.expressions import ( PsSub, PsMul, PsDiv, - PsConstantExpr + PsConstantExpr, + PsCast, + PsCall, ) from ..ast.vector import PsVecMemAcc, PsVecBroadcast from ...types import PsCustomType, PsVectorType, PsPointerType @@ -23,15 +25,15 @@ from ..exceptions import MaterializationError from .generic_cpu import GenericVectorCpu from ..kernelcreation import KernelCreationContext -from ...types.quick import Fp, SInt -from ..functions import CFunction +from ...types.quick import Fp, UInt, SInt +from ..functions import CFunction, PsMathFunction, MathFunctions class X86VectorArch(Enum): SSE = 128 AVX = 256 AVX512 = 512 - AVX512_FP16 = AVX512 + 1 # TODO improve modelling? + AVX512_FP16 = AVX512 + 1 # TODO improve modelling? def __ge__(self, other: X86VectorArch) -> bool: return self.value >= other.value @@ -78,7 +80,7 @@ class X86VectorArch(Enum): ) return suffix - + def intrin_type(self, vtype: PsVectorType): scalar_type = vtype.scalar_type match scalar_type: @@ -96,9 +98,7 @@ class X86VectorArch(Enum): ) if vtype.width > self.max_vector_width: - raise MaterializationError( - f"x86/{self} does not support {vtype}" - ) + raise MaterializationError(f"x86/{self} does not support {vtype}") return PsCustomType(f"__m{vtype.width}{suffix}") @@ -161,26 +161,124 @@ class X86VectorCpu(GenericVectorCpu): match expr: case PsUnOp() | PsBinOp(): func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype()) - return func(*operands) + intrinsic = func(*operands) + intrinsic.dtype = func.return_type + return intrinsic case _: raise MaterializationError(f"Cannot map {type(expr)} to x86 intrinsic") + def math_func_intrinsic( + self, expr: PsCall, operands: Sequence[PsExpression] + ) -> PsExpression: + assert isinstance(expr.function, PsMathFunction) + + vtype = expr.get_dtype() + assert isinstance(vtype, PsVectorType) + + prefix = self._vector_arch.intrin_prefix(vtype) + suffix = self._vector_arch.intrin_suffix(vtype) + rtype = atype = self._vector_arch.intrin_type(vtype) + + match expr.function.func: + case ( + MathFunctions.Exp + | MathFunctions.Log + | MathFunctions.Sin + | MathFunctions.Cos + | MathFunctions.Tan + | MathFunctions.Sinh + | MathFunctions.Cosh + | MathFunctions.ASin + | MathFunctions.ACos + | MathFunctions.ATan + | MathFunctions.ATan2 + | MathFunctions.Pow + ): + raise MaterializationError( + "Trigonometry, exp, log, and pow require SVML." + ) + + case MathFunctions.Floor | MathFunctions.Ceil if vtype.is_float(): + opstr = expr.function.func.function_name + if vtype.width > 256: + raise MaterializationError("512bit ceil/floor require SVML.") + + case MathFunctions.Min | MathFunctions.Max: + opstr = expr.function.func.function_name + if ( + vtype.is_int() + and vtype.scalar_type.width == 64 + and self._vector_arch < X86VectorArch.AVX512 + ): + raise MaterializationError( + "64bit integer (signed and unsigned) min/max intrinsics require AVX512." + ) + + case MathFunctions.Abs: + assert len(operands) == 1, "abs takes exactly one argument." + op = operands[0] + + match vtype.scalar_type: + case UInt(): + return op + + case SInt(width): + opstr = expr.function.func.function_name + if width == 64 and self._vector_arch < X86VectorArch.AVX512: + raise MaterializationError( + "64bit integer abs intrinsic requires AVX512." + ) + + case Fp(): + neg_zero = self.constant_intrinsic(PsConstant(-0.0, vtype)) + + opstr = "andnot" + func = CFunction( + f"{prefix}_{opstr}_{suffix}", (atype,) * 2, rtype + ) + + return func(neg_zero, op) + + case _: + raise MaterializationError( + f"x86/{self} does not support {expr.function.func.function_name} on type {vtype}." + ) + + if expr.function.func in [ + MathFunctions.ATan2, + MathFunctions.Min, + MathFunctions.Max, + ]: + num_args = 2 + else: + num_args = 1 + + func = CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) + return func(*operands) + def vector_load(self, acc: PsVecMemAcc) -> PsExpression: if acc.stride is None: - load_func = _x86_packed_load(self._vector_arch, acc.dtype, False) - return load_func( - PsAddressOf(PsMemAcc(acc.pointer, acc.offset)) - ) + load_func, addr_type = _x86_packed_load(self._vector_arch, acc.dtype, False) + addr: PsExpression = PsAddressOf(PsMemAcc(acc.pointer, acc.offset)) + if addr_type: + addr = PsCast(addr_type, addr) + intrinsic = load_func(addr) + intrinsic.dtype = load_func.return_type + return intrinsic else: raise NotImplementedError("Gather loads not implemented yet.") def vector_store(self, acc: PsVecMemAcc, arg: PsExpression) -> PsExpression: if acc.stride is None: - store_func = _x86_packed_store(self._vector_arch, acc.dtype, False) - return store_func( - PsAddressOf(PsMemAcc(acc.pointer, acc.offset)), - arg, + store_func, addr_type = _x86_packed_store( + self._vector_arch, acc.dtype, False ) + addr: PsExpression = PsAddressOf(PsMemAcc(acc.pointer, acc.offset)) + if addr_type: + addr = PsCast(addr_type, addr) + intrinsic = store_func(addr, arg) + intrinsic.dtype = store_func.return_type + return intrinsic else: raise NotImplementedError("Scatter stores not implemented yet.") @@ -188,26 +286,46 @@ class X86VectorCpu(GenericVectorCpu): @cache def _x86_packed_load( varch: X86VectorArch, vtype: PsVectorType, aligned: bool -) -> CFunction: +) -> tuple[CFunction, PsPointerType | None]: prefix = varch.intrin_prefix(vtype) - suffix = varch.intrin_suffix(vtype) ptr_type = PsPointerType(vtype.scalar_type, const=True) - return CFunction( - f"{prefix}_load{'' if aligned else 'u'}_{suffix}", (ptr_type,), vtype + + if isinstance(vtype.scalar_type, SInt): + suffix = f"si{vtype.width}" + addr_type = PsPointerType(varch.intrin_type(vtype)) + else: + suffix = varch.intrin_suffix(vtype) + addr_type = None + + return ( + CFunction( + f"{prefix}_load{'' if aligned else 'u'}_{suffix}", (ptr_type,), vtype + ), + addr_type, ) @cache def _x86_packed_store( varch: X86VectorArch, vtype: PsVectorType, aligned: bool -) -> CFunction: +) -> tuple[CFunction, PsPointerType | None]: prefix = varch.intrin_prefix(vtype) - suffix = varch.intrin_suffix(vtype) ptr_type = PsPointerType(vtype.scalar_type, const=True) - return CFunction( - f"{prefix}_store{'' if aligned else 'u'}_{suffix}", - (ptr_type, vtype), - PsCustomType("void"), + + if isinstance(vtype.scalar_type, SInt): + suffix = f"si{vtype.width}" + addr_type = PsPointerType(varch.intrin_type(vtype)) + else: + suffix = varch.intrin_suffix(vtype) + addr_type = None + + return ( + CFunction( + f"{prefix}_store{'' if aligned else 'u'}_{suffix}", + (ptr_type, vtype), + PsCustomType("void"), + ), + addr_type, ) @@ -223,7 +341,7 @@ def _x86_op_intrin( case PsVecBroadcast(): opstr = "set1" if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4: - suffix += "x" + suffix += "x" atype = vtype.scalar_type case PsAdd(): opstr = "add" @@ -239,8 +357,52 @@ def _x86_op_intrin( opstr = "mul" case PsDiv(): opstr = "div" + case PsCast(target_type, arg): + atype = arg.dtype + widest_type = max(vtype, atype, key=lambda t: t.width) + + assert target_type == vtype, "type mismatch" + assert isinstance(atype, PsVectorType) + + def panic(detail: str = ""): + raise MaterializationError( + f"Unable to select intrinsic for type conversion: " + f"{varch.name} does not support packed conversion from {atype} to {target_type}. {detail}\n" + f" at: {op}" + ) + + if atype == vtype: + panic("Use `EliminateConstants` to eliminate trivial casts.") + + match (atype.scalar_type, vtype.scalar_type): + # Not supported: cvtepi8_pX, cvtpX_epi8, cvtepi16_p[sd], cvtp[sd]_epi16 + case ( + (SInt(8), Fp()) + | (Fp(), SInt(8)) + | (SInt(16), Fp(32)) + | (SInt(16), Fp(64)) + | (Fp(32), SInt(16)) + | (Fp(64), SInt(16)) + ): + panic() + # AVX512 only: cvtepi64_pX, cvtpX_epi64 + case (SInt(64), Fp()) | ( + Fp(), + SInt(64), + ) if varch < X86VectorArch.AVX512: + panic() + # AVX512 only: cvtepiA_epiT if A > T + case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512: + panic() + + case _: + prefix = varch.intrin_prefix(widest_type) + opstr = f"cvt{varch.intrin_suffix(atype)}" + case _: - raise MaterializationError(f"Unable to select operation intrinsic for {type(op)}") + raise MaterializationError( + f"Unable to select operation intrinsic for {type(op)}" + ) num_args = 1 if isinstance(op, PsUnOp) else 2 return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) diff --git a/src/pystencils/backend/transformations/ast_vectorizer.py b/src/pystencils/backend/transformations/ast_vectorizer.py index d0792fc366fa716f3c81b8f35adb4ceddb460d78..ab4401f9ca0142d9cfeec258eeb34fb2a7f6e8eb 100644 --- a/src/pystencils/backend/transformations/ast_vectorizer.py +++ b/src/pystencils/backend/transformations/ast_vectorizer.py @@ -1,5 +1,6 @@ from __future__ import annotations -from typing import overload +from textwrap import indent +from typing import cast, overload from dataclasses import dataclass @@ -11,7 +12,13 @@ from ..constants import PsConstant from ..functions import PsMathFunction from ..ast import PsAstNode -from ..ast.structural import PsBlock, PsDeclaration, PsAssignment +from ..ast.structural import ( + PsBlock, + PsDeclaration, + PsAssignment, + PsLoop, + PsEmptyLeafMixIn, +) from ..ast.expressions import ( PsExpression, PsAddressOf, @@ -24,6 +31,7 @@ from ..ast.expressions import ( PsCall, PsMemAcc, PsBufferAcc, + PsSubscript, PsAdd, PsMul, PsSub, @@ -148,6 +156,10 @@ class VectorizationContext: ) return PsVectorType(scalar_type, self._lanes) + def axis_ctr_dependees(self, symbols: set[PsSymbol]) -> set[PsSymbol]: + """Returns all symbols in `symbols` that depend on the axis counter.""" + return symbols & (self.vectorized_symbols.keys() | {self.axis.counter}) + @dataclass class Affine: @@ -246,7 +258,7 @@ class AstVectorizer: def __call__(self, node: PsAstNode, vc: VectorizationContext) -> PsAstNode: """Perform subtree vectorization. - + Args: node: Root of the subtree that should be vectorized vc: Object describing the current vectorization context @@ -273,6 +285,14 @@ class AstVectorizer: return PsDeclaration(vec_lhs, vec_rhs) case PsAssignment(lhs, rhs): + if ( + isinstance(lhs, PsSymbolExpr) + and lhs.symbol in vc.vectorized_symbols + ): + return PsAssignment( + self.visit_expr(lhs, vc), self.visit_expr(rhs, vc) + ) + if not isinstance(lhs, (PsMemAcc, PsBufferAcc)): raise VectorizationError(f"Unable to vectorize assignment to {lhs}") @@ -286,6 +306,29 @@ class AstVectorizer: rhs_vec = self.visit_expr(rhs, vc) return PsAssignment(lhs_vec, rhs_vec) + case PsLoop(counter, start, stop, step, body): + # Check that loop bounds are lane-invariant + free_symbols = ( + self._collect_symbols(start) + | self._collect_symbols(stop) + | self._collect_symbols(step) + ) + vec_dependencies = vc.axis_ctr_dependees(free_symbols) + if vec_dependencies: + raise VectorizationError( + "Unable to vectorize loop depending on vectorized symbols:\n" + f" Offending dependencies:\n" + f" {vec_dependencies}\n" + f" Found in loop:\n" + f"{indent(str(node), ' ')}" + ) + + vectorized_body = cast(PsBlock, self.visit(body, vc)) + return PsLoop(counter, start, stop, step, vectorized_body) + + case PsEmptyLeafMixIn(): + return node + case _: raise NotImplementedError(f"Vectorization of {node} is not implemented") @@ -426,6 +469,23 @@ class AstVectorizer: # Buffer access is lane-invariant vec_expr = PsVecBroadcast(vc.lanes, expr.clone()) + case PsSubscript(array, index): + # Check that array expression and indices are lane-invariant + free_symbols = self._collect_symbols(array).union( + *[self._collect_symbols(i) for i in index] + ) + vec_dependencies = vc.axis_ctr_dependees(free_symbols) + if vec_dependencies: + raise VectorizationError( + "Unable to vectorize array subscript depending on vectorized symbols:\n" + f" Offending dependencies:\n" + f" {vec_dependencies}\n" + f" Found in expression:\n" + f"{indent(str(expr), ' ')}" + ) + + vec_expr = PsVecBroadcast(vc.lanes, expr.clone()) + case _: raise NotImplementedError( f"Vectorization of {type(expr)} is not implemented" diff --git a/src/pystencils/backend/transformations/eliminate_constants.py b/src/pystencils/backend/transformations/eliminate_constants.py index d6f13bf078ef3f70e4098cb1d787fb65a62db7ec..ab1cabc557a88b03766e6a9fb2ab44a84a5711da 100644 --- a/src/pystencils/backend/transformations/eliminate_constants.py +++ b/src/pystencils/backend/transformations/eliminate_constants.py @@ -40,13 +40,7 @@ from ..ast.util import AstEqWrapper from ..constants import PsConstant from ..memory import PsSymbol from ..functions import PsMathFunction -from ...types import ( - PsNumericType, - PsBoolType, - PsScalarType, - PsVectorType, - constify -) +from ...types import PsNumericType, PsBoolType, PsScalarType, PsVectorType, constify __all__ = ["EliminateConstants"] @@ -261,6 +255,9 @@ class EliminateConstants: assert isinstance(target_type, PsNumericType) return PsConstantExpr(c.reinterpret_as(target_type)), True + case PsCast(target_type, op) if target_type == op.get_dtype(): + return op, all(subtree_constness) + case PsVecBroadcast(lanes, PsConstantExpr(c)): scalar_type = c.get_dtype() assert isinstance(scalar_type, PsScalarType) @@ -358,7 +355,10 @@ class EliminateConstants: from ...utils import c_intdiv folded = PsConstant(c_intdiv(v1, v2), dtype) - elif isinstance(dtype, PsNumericType) and dtype.is_float(): + elif ( + isinstance(dtype, PsNumericType) + and dtype.is_float() + ): folded = PsConstant(v1 / v2, dtype) if folded is not None: diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index a0eb5ce8f94510a18e826d1defa423b60e0d808d..060192810a7ccb9ab9ed13f64dd7948791078ea4 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -4,11 +4,12 @@ from typing import cast from ..kernelcreation import KernelCreationContext from ..memory import PsSymbol from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement -from ..ast.expressions import PsExpression -from ...types import PsVectorType, constify, deconstify +from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral +from ...types import PsCustomType, PsVectorType, constify, deconstify from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp from ..ast.vector import PsVecMemAcc from ..exceptions import MaterializationError +from ..functions import CFunction, PsMathFunction from ..platforms import GenericVectorCpu @@ -39,22 +40,34 @@ class SelectionContext: class SelectIntrinsics: """Lower IR vector types to intrinsic vector types, and IR vector operations to intrinsic vector operations. - + This transformation will replace all vectorial IR elements by conforming implementations using compiler intrinsics for the given execution platform. Args: ctx: The current kernel creation context platform: Platform object representing the target hardware, which provides the intrinsics + use_builtin_convertvector: If `True`, type conversions between SIMD + vectors use the compiler builtin ``__builtin_convertvector`` + instead of instrinsics. It is supported by Clang >= 3.7, GCC >= 9.1, + and ICX. Not supported by ICC or MSVC. Activate if you need type + conversions not natively supported by your CPU, e.g. conversion from + 64bit integer to double on an x86 AVX machine. Defaults to `False`. Raises: MaterializationError: If a vector type or operation cannot be represented by intrinsics on the given platform """ - def __init__(self, ctx: KernelCreationContext, platform: GenericVectorCpu): + def __init__( + self, + ctx: KernelCreationContext, + platform: GenericVectorCpu, + use_builtin_convertvector: bool = False, + ): self._ctx = ctx self._platform = platform + self._use_builtin_convertvector = use_builtin_convertvector def __call__(self, node: PsAstNode) -> PsAstNode: return self.visit(node, SelectionContext(self._ctx, self._platform)) @@ -68,11 +81,11 @@ class SelectIntrinsics: lhs_new = cast(PsSymbolExpr, self.visit_expr(lhs, sc)) rhs_new = self.visit_expr(rhs, sc) return PsDeclaration(lhs_new, rhs_new) - - case PsAssignment(lhs, rhs) if isinstance(lhs, PsVecMemAcc): + + case PsAssignment(lhs, rhs) if isinstance(lhs, PsVecMemAcc): new_rhs = self.visit_expr(rhs, sc) return PsStatement(self._platform.vector_store(lhs, new_rhs)) - + case _: node.children = [self.visit(c, sc) for c in node.children] @@ -89,6 +102,22 @@ class SelectIntrinsics: case PsConstantExpr(c): return self._platform.constant_intrinsic(c) + case PsCast(target_type, operand) if self._use_builtin_convertvector: + assert isinstance(target_type, PsVectorType) + op = self.visit_expr(operand, sc) + + rtype = PsCustomType( + f"{target_type.scalar_type.c_string()} __attribute__((__vector_size__({target_type.itemsize})))" + ) + target_type_literal = PsExpression.make(PsLiteral(rtype.name, rtype)) + + func = CFunction( + "__builtin_convertvector", (op.get_dtype(), rtype), target_type + ) + intrinsic = func(op, target_type_literal) + intrinsic.dtype = func.return_type + return intrinsic + case PsUnOp(operand): op = self.visit_expr(operand, sc) return self._platform.op_intrinsic(expr, [op]) @@ -102,6 +131,10 @@ class SelectIntrinsics: case PsVecMemAcc(): return self._platform.vector_load(expr) + case PsCall(function, args) if isinstance(function, PsMathFunction): + arguments = [self.visit_expr(a, sc) for a in args] + return self._platform.math_func_intrinsic(expr, arguments) + case _: raise MaterializationError( f"Unable to select intrinsic implementation for {expr}" diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index 5d897dd7ebd0415494161b42c2192d4eaf08adbd..b847a8139a8725c9c926b7c12c9556aba3ec6e87 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -115,6 +115,18 @@ class Target(Flag): return avail_targets.pop() else: return Target.GenericCPU + + @staticmethod + def available_targets() -> list[Target]: + targets = [Target.GenericCPU] + try: + import cupy # noqa: F401 + targets.append(Target.CUDA) + except ImportError: + pass + + targets += Target.available_vector_cpu_targets() + return targets @staticmethod def available_vector_cpu_targets() -> list[Target]: diff --git a/tests/fixtures.py b/tests/fixtures.py index 7c95216147d02b7943b0c218af6fbb5b5175559b..706d1afbb3ba2df0d78f36b113009ad7f518c819 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -31,11 +31,13 @@ except ImportError: AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets() TARGET_IDS = [t.name for t in AVAILABLE_TARGETS] + @pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS) def target(request) -> ps.Target: """Provides all code generation targets available on the current hardware""" return request.param + @pytest.fixture def gen_config(target: ps.Target): """Default codegen configuration for the current target. @@ -56,6 +58,7 @@ def gen_config(target: ps.Target): return gen_config + @pytest.fixture() def xp(target: ps.Target) -> ModuleType: """Primary array module for the current target. diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py index cab1affcb3a344033458829ba28765a9a1ff8c48..9b7dd2852c73242d8a8549f1a7c39e26024848b2 100644 --- a/tests/kernelcreation/test_functions.py +++ b/tests/kernelcreation/test_functions.py @@ -1,8 +1,17 @@ import sympy as sp import numpy as np import pytest - -from pystencils import fields, create_kernel, CreateKernelConfig, Target, Assignment, Field +from dataclasses import replace +from itertools import product + +from pystencils import ( + fields, + create_kernel, + CreateKernelConfig, + Target, + Assignment, + Field, +) from pystencils.backend.ast import dfs_preorder from pystencils.backend.ast.expressions import PsCall @@ -34,38 +43,42 @@ def binary_function(name, xp): }[name] -@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA)) +AVAIL_TARGETS = Target.available_targets() + + @pytest.mark.parametrize( - "function_name", - ( - "exp", - "log", - "sin", - "cos", - "tan", - "sinh", - "cosh", - "asin", - "acos", - "atan", - "abs", - "floor", - "ceil", - ), + "function_name, target", + list( + product( + ( + "exp", + "log", + "sin", + "cos", + "tan", + "sinh", + "cosh", + "asin", + "acos", + "atan", + ), + [t for t in AVAIL_TARGETS if Target._X86 not in t], + ) + ) + + list( + product( + ["floor", "ceil"], [t for t in AVAIL_TARGETS if Target._AVX512 not in t] + ) + ) + + list(product(["abs"], AVAIL_TARGETS)), ) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) -def test_unary_functions(target, function_name, dtype): - if target == Target.CUDA: - xp = pytest.importorskip("cupy") - else: - xp = np - +def test_unary_functions(gen_config, xp, function_name, dtype): sp_func, xp_func = unary_function(function_name, xp) resolution = np.finfo(dtype).resolution - inp = xp.array( - [[0.1, 0.2, 0.3], [-0.8, -1.6, -12.592], [xp.pi, xp.e, 0.0]], dtype=dtype - ) + # Array size should be larger than eight, such that vectorized kernels don't just run their remainder loop + inp = xp.array([0.1, 0.2, 0.0, -0.8, -1.6, -12.592, xp.pi, xp.e, -0.3], dtype=dtype) outp = xp.zeros_like(inp) reference = xp_func(inp) @@ -74,7 +87,7 @@ def test_unary_functions(target, function_name, dtype): outp_field = inp_field.new_field_with_different_name("outp") asms = [Assignment(outp_field.center(), sp_func(inp_field.center()))] - gen_config = CreateKernelConfig(target=target, default_dtype=dtype) + gen_config = replace(gen_config, default_dtype=dtype) kernel = create_kernel(asms, gen_config) kfunc = kernel.compile() @@ -83,28 +96,26 @@ def test_unary_functions(target, function_name, dtype): xp.testing.assert_allclose(outp, reference, rtol=resolution) -@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA)) -@pytest.mark.parametrize("function_name", ("min", "max", "pow", "atan2")) +@pytest.mark.parametrize( + "function_name,target", + list(product(["min", "max"], AVAIL_TARGETS)) + + list( + product(["pow", "atan2"], [t for t in AVAIL_TARGETS if Target._X86 not in t]) + ), +) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) -def test_binary_functions(target, function_name, dtype): - if target == Target.CUDA: - xp = pytest.importorskip("cupy") - else: - xp = np - - sp_func, np_func = binary_function(function_name, xp) +def test_binary_functions(gen_config, xp, function_name, dtype): + sp_func, xp_func = binary_function(function_name, xp) resolution: dtype = np.finfo(dtype).resolution - inp = xp.array( - [[0.1, 0.2, 0.3], [-0.8, -1.6, -12.592], [xp.pi, xp.e, 0.0]], dtype=dtype - ) + inp = xp.array([0.1, 0.2, 0.3, -0.8, -1.6, -12.592, xp.pi, xp.e, 0.0], dtype=dtype) inp2 = xp.array( - [[3.1, -0.5, 21.409], [11.0, 1.0, -14e3], [2.0 * xp.pi, -xp.e, 0.0]], + [3.1, -0.5, 21.409, 11.0, 1.0, -14e3, 2.0 * xp.pi, -xp.e, 0.0], dtype=dtype, ) outp = xp.zeros_like(inp) - reference = np_func(inp, inp2) + reference = xp_func(inp, inp2) inp_field = Field.create_from_numpy_array("inp", inp) inp2_field = Field.create_from_numpy_array("inp2", inp) @@ -115,7 +126,7 @@ def test_binary_functions(target, function_name, dtype): outp_field.center(), sp_func(inp_field.center(), inp2_field.center()) ) ] - gen_config = CreateKernelConfig(target=target, default_dtype=dtype) + gen_config = replace(gen_config, default_dtype=dtype) kernel = create_kernel(asms, gen_config) kfunc = kernel.compile() @@ -124,26 +135,107 @@ def test_binary_functions(target, function_name, dtype): xp.testing.assert_allclose(outp, reference, rtol=resolution) -@pytest.mark.parametrize('a', [sp.Symbol('a'), fields('a: float64[2d]').center]) +dtype_and_target_for_integer_funcs = pytest.mark.parametrize( + "dtype, target", + list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA])) + + list( + product( + [np.int64], + [ + t + for t in AVAIL_TARGETS + if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA) + ], + ) + ), +) + + +@dtype_and_target_for_integer_funcs +def test_integer_abs(gen_config, xp, dtype): + sp_func, xp_func = unary_function("abs", xp) + + smallest = np.iinfo(dtype).min + largest = np.iinfo(dtype).max + + inp = xp.array([-1, 0, 1, 3, -5, -312, smallest + 1, largest], dtype=dtype) + + outp = xp.zeros_like(inp) + reference = xp_func(inp) + + inp_field = Field.create_from_numpy_array("inp", inp) + outp_field = inp_field.new_field_with_different_name("outp") + + asms = [Assignment(outp_field.center(), sp_func(inp_field.center()))] + gen_config = replace(gen_config, default_dtype=dtype) + + kernel = create_kernel(asms, gen_config) + kfunc = kernel.compile() + kfunc(inp=inp, outp=outp) + + xp.testing.assert_array_equal(outp, reference) + + +@pytest.mark.parametrize("function_name", ("min", "max")) +@dtype_and_target_for_integer_funcs +def test_integer_binary_functions(gen_config, xp, function_name, dtype): + sp_func, xp_func = binary_function(function_name, xp) + + smallest = np.iinfo(dtype).min + largest = np.iinfo(dtype).max + + inp1 = xp.array([-1, 0, 1, 3, -5, -312, smallest + 1, largest], dtype=dtype) + inp2 = xp.array([3, -5, 1, 12, 1, 11, smallest + 42, largest - 3], dtype=dtype) + + outp = xp.zeros_like(inp1) + reference = xp_func(inp1, inp2) + + inp_field = Field.create_from_numpy_array("inp1", inp1) + inp2_field = Field.create_from_numpy_array("inp2", inp2) + outp_field = inp_field.new_field_with_different_name("outp") + + asms = [ + Assignment( + outp_field.center(), sp_func(inp_field.center(), inp2_field.center()) + ) + ] + gen_config = replace(gen_config, default_dtype=dtype) + + kernel = create_kernel(asms, gen_config) + kfunc = kernel.compile() + kfunc(inp1=inp1, inp2=inp2, outp=outp) + + xp.testing.assert_array_equal(outp, reference) + + +@pytest.mark.parametrize("a", [sp.Symbol("a"), fields("a: float64[2d]").center]) def test_avoid_pow(a): - x = fields('x: float64[2d]') + x = fields("x: float64[2d]") - up = Assignment(x.center_vector[0], 2 * a ** 2 / 3) + up = Assignment(x.center_vector[0], 2 * a**2 / 3) func = create_kernel(up) - powers = list(dfs_preorder(func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name)) + powers = list( + dfs_preorder( + func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name + ) + ) assert not powers @pytest.mark.xfail(reason="fast_div not available yet") def test_avoid_pow_fast_div(): - x = fields('x: float64[2d]') - a = fields('a: float64[2d]').center + x = fields("x: float64[2d]") + a = fields("a: float64[2d]").center up = Assignment(x.center_vector[0], fast_division(1, (a**2))) func = create_kernel(up, config=CreateKernelConfig(target=Target.GPU)) - powers = list(dfs_preorder(func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name)) + powers = list( + dfs_preorder( + func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name + ) + ) assert not powers @@ -151,14 +243,23 @@ def test_avoid_pow_move_constants(): # At the end of the kernel creation the function move_constants_before_loop will be called # This function additionally contains substitutions for symbols with the same value # Thus it simplifies the equations again - x = fields('x: float64[2d]') + x = fields("x: float64[2d]") a, b, c = sp.symbols("a, b, c") - up = [Assignment(a, 0.0), - Assignment(b, 0.0), - Assignment(c, 0.0), - Assignment(x.center_vector[0], a**2/18 - a*b/6 - a/18 + b**2/18 + b/18 - c**2/36)] + up = [ + Assignment(a, 0.0), + Assignment(b, 0.0), + Assignment(c, 0.0), + Assignment( + x.center_vector[0], + a**2 / 18 - a * b / 6 - a / 18 + b**2 / 18 + b / 18 - c**2 / 36, + ), + ] func = create_kernel(up) - powers = list(dfs_preorder(func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name)) + powers = list( + dfs_preorder( + func.body, lambda n: isinstance(n, PsCall) and "pow" in n.function.name + ) + ) assert not powers diff --git a/tests/kernelcreation/test_type_cast.py b/tests/kernelcreation/test_type_cast.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad6d867042ff6e57e5baee8c12ff45bae17e8e4 --- /dev/null +++ b/tests/kernelcreation/test_type_cast.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest +from itertools import product + +from pystencils import ( + create_kernel, + Target, + Assignment, + Field, +) +from pystencils.sympyextensions.typed_sympy import CastFunc + + +AVAIL_TARGETS_NO_SSE = [t for t in Target.available_targets() if Target._SSE not in t] + + +target_and_dtype = pytest.mark.parametrize( + "target, from_type, to_type", + list( + product( + [ + t + for t in AVAIL_TARGETS_NO_SSE + if Target._X86 in t and Target._AVX512 not in t + ], + [np.int32, np.float32, np.float64], + [np.int32, np.float32, np.float64], + ) + ) + + list( + product( + [ + t + for t in AVAIL_TARGETS_NO_SSE + if Target._X86 not in t or Target._AVX512 in t + ], + [np.int32, np.int64, np.float32, np.float64], + [np.int32, np.int64, np.float32, np.float64], + ) + ), +) + + +@target_and_dtype +def test_type_cast(gen_config, xp, from_type, to_type): + if np.issubdtype(from_type, np.floating): + inp = xp.array([-1.25, -0, 1.5, 3, -5, -312, 42, 6.625, -9], dtype=from_type) + else: + inp = xp.array([-1, 0, 1, 3, -5, -312, 42, 6, -9], dtype=from_type) + + outp = xp.zeros_like(inp).astype(to_type) + truncated = inp.astype(to_type) + rounded = xp.round(inp).astype(to_type) + + inp_field = Field.create_from_numpy_array("inp", inp) + outp_field = Field.create_from_numpy_array("outp", outp) + + asms = [Assignment(outp_field.center(), CastFunc(inp_field.center(), to_type))] + + kernel = create_kernel(asms, gen_config) + kfunc = kernel.compile() + kfunc(inp=inp, outp=outp) + + if np.issubdtype(from_type, np.floating) and not np.issubdtype( + to_type, np.floating + ): + # rounding mode depends on platform + try: + xp.testing.assert_array_equal(outp, truncated) + except AssertionError: + xp.testing.assert_array_equal(outp, rounded) + else: + xp.testing.assert_array_equal(outp, truncated) diff --git a/tests/nbackend/transformations/test_ast_vectorizer.py b/tests/nbackend/transformations/test_ast_vectorizer.py index ea425349529c45b94317a98d2d9f305933c9ba60..f92f1e768a6e04c1eb5292612d6406365520bb72 100644 --- a/tests/nbackend/transformations/test_ast_vectorizer.py +++ b/tests/nbackend/transformations/test_ast_vectorizer.py @@ -1,7 +1,7 @@ import sympy as sp import pytest -from pystencils import Assignment, TypedSymbol, fields, FieldType +from pystencils import Assignment, TypedSymbol, fields, FieldType, make_slice from pystencils.sympyextensions import CastFunc, mem_acc from pystencils.sympyextensions.pointers import AddressOf @@ -18,19 +18,25 @@ from pystencils.backend.transformations import ( AstVectorizer, ) from pystencils.backend.ast import dfs_preorder -from pystencils.backend.ast.structural import PsBlock, PsDeclaration, PsAssignment +from pystencils.backend.ast.structural import ( + PsBlock, + PsDeclaration, + PsAssignment, + PsLoop, +) from pystencils.backend.ast.expressions import ( PsSymbolExpr, PsConstantExpr, PsExpression, PsCast, PsMemAcc, - PsCall + PsCall, + PsSubscript, ) from pystencils.backend.functions import CFunction from pystencils.backend.ast.vector import PsVecBroadcast, PsVecMemAcc from pystencils.backend.exceptions import VectorizationError -from pystencils.types import PsVectorType, deconstify, create_type +from pystencils.types import PsArrayType, PsVectorType, deconstify, create_type def test_vectorize_expressions(): @@ -56,7 +62,9 @@ def test_vectorize_expressions(): factory.parse_sympy(-x * y + 13 * z - 4 * (x / w) * (x + z)), factory.parse_sympy(sp.sin(x + z) - sp.cos(w)), factory.parse_sympy(y**2 - x**2), - typify(- factory.parse_sympy(x / (w**2))), # place the negation outside, since SymPy would remove it + typify( + -factory.parse_sympy(x / (w**2)) + ), # place the negation outside, since SymPy would remove it factory.parse_sympy(13 + (1 / w) - sp.exp(x) * 24), ]: vec_expr = vectorize.visit(expr, vc) @@ -239,7 +247,31 @@ def test_reject_symbol_assignments(): with pytest.raises(VectorizationError): _ = vectorize.visit(asm, vc) - + +def test_vectorize_assignments(): + ctx = KernelCreationContext() + factory = AstFactory(ctx) + + x, y = sp.symbols("x, y") + + vectorize = AstVectorizer(ctx) + + axis = VectorizationAxis( + ctx.get_symbol("ctr", ctx.index_dtype), + ) + vc = VectorizationContext(ctx, 4, axis) + + decl = PsDeclaration(factory.parse_sympy(x), factory.parse_sympy(sp.sympify(0))) + asm = PsAssignment(factory.parse_sympy(x), factory.parse_sympy(3 + y)) + ast = PsBlock([decl, asm]) + + vec_ast = vectorize.visit(ast, vc) + vec_asm = vec_ast.statements[1] + + assert isinstance(vec_asm, PsAssignment) + assert isinstance(vec_asm.lhs.symbol.dtype, PsVectorType) + + def test_vectorize_memory_assignments(): ctx = KernelCreationContext() factory = AstFactory(ctx) @@ -260,7 +292,7 @@ def test_vectorize_memory_assignments(): asm = typify( PsAssignment( factory.parse_sympy(mem_acc(ptr, 3 * ctr + 2)), - factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3)) + factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3)), ) ) @@ -303,7 +335,7 @@ def test_invalid_memory_assignments(): asm = typify( PsAssignment( factory.parse_sympy(mem_acc(ptr, 3 * i + 2)), - factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3)) + factory.parse_sympy(x + y * mem_acc(ptr, ctr + 3)), ) ) @@ -376,7 +408,9 @@ def test_vectorize_mem_acc(): assert vec_acc.vector_entries == 4 # Even more complex affine - idx = - factory.parse_index(ctr) / factory.parse_index(i) - factory.parse_index(ctr) * factory.parse_index(j) + idx = -factory.parse_index(ctr) / factory.parse_index(i) - factory.parse_index( + ctr + ) * factory.parse_index(j) acc = typify(PsMemAcc(factory.parse_sympy(ptr), idx)) assert isinstance(acc, PsMemAcc) @@ -386,11 +420,15 @@ def test_vectorize_mem_acc(): assert vec_acc.pointer.structurally_equal(acc.pointer) assert vec_acc.offset is not acc.offset assert vec_acc.offset.structurally_equal(acc.offset) - assert vec_acc.stride.structurally_equal(factory.parse_index(-1) / factory.parse_index(i) - factory.parse_index(j)) + assert vec_acc.stride.structurally_equal( + factory.parse_index(-1) / factory.parse_index(i) - factory.parse_index(j) + ) assert vec_acc.vector_entries == 4 # Mixture of strides in affine and axis - vc = VectorizationContext(ctx, 4, VectorizationAxis(ctx.get_symbol("ctr"), step=factory.parse_index(3))) + vc = VectorizationContext( + ctx, 4, VectorizationAxis(ctx.get_symbol("ctr"), step=factory.parse_index(3)) + ) acc = factory.parse_sympy(mem_acc(ptr, 3 * i + 5 * ctr)) assert isinstance(acc, PsMemAcc) @@ -421,7 +459,9 @@ def test_invalid_mem_acc(): ptr = TypedSymbol("ptr", create_type("float64 *")) # Non-symbol pointer - acc = factory.parse_sympy(mem_acc(AddressOf(mem_acc(ptr, 10)), 3 * i + ctr * (3 + ctr))) + acc = factory.parse_sympy( + mem_acc(AddressOf(mem_acc(ptr, 10)), 3 * i + ctr * (3 + ctr)) + ) with pytest.raises(VectorizationError): _ = vectorize.visit(acc, vc) @@ -503,3 +543,107 @@ def test_invalid_buffer_acc(): with pytest.raises(VectorizationError): _ = vectorize.visit(acc, vc) + + +def test_vectorize_subscript(): + ctx = KernelCreationContext() + factory = AstFactory(ctx) + vectorize = AstVectorizer(ctx) + + ctr = ctx.get_symbol("ctr", ctx.index_dtype) + + axis = VectorizationAxis(ctr) + vc = VectorizationContext(ctx, 4, axis) + + acc = PsSubscript( + PsExpression.make(ctx.get_symbol("arr", PsArrayType(ctx.default_dtype, 42))), + [PsExpression.make(ctx.get_symbol("i", ctx.index_dtype))], + ) # independent of vectorization axis + + vec_acc = vectorize.visit(factory._typify(acc), vc) + assert isinstance(vec_acc, PsVecBroadcast) + assert isinstance(vec_acc.operand, PsSubscript) + + +def test_invalid_subscript(): + ctx = KernelCreationContext() + factory = AstFactory(ctx) + vectorize = AstVectorizer(ctx) + + ctr = ctx.get_symbol("ctr", ctx.index_dtype) + + axis = VectorizationAxis(ctr) + vc = VectorizationContext(ctx, 4, axis) + + acc = PsSubscript( + PsExpression.make(ctx.get_symbol("arr", PsArrayType(ctx.default_dtype, 42))), + [PsExpression.make(ctr)], # depends on vectorization axis + ) + + with pytest.raises(VectorizationError): + _ = vectorize.visit(factory._typify(acc), vc) + + +def test_vectorize_nested_loop(): + ctx = KernelCreationContext() + factory = AstFactory(ctx) + vectorize = AstVectorizer(ctx) + + ctr = ctx.get_symbol("i", ctx.index_dtype) + + axis = VectorizationAxis(ctr) + vc = VectorizationContext(ctx, 4, axis) + + ast = factory.loop_nest( + ("i", "j"), + make_slice[:8, :8], # inner loop does not depend on vectorization axis + PsBlock( + [ + PsDeclaration( + PsExpression.make(ctx.get_symbol("x", ctx.default_dtype)), + PsExpression.make(PsConstant(42, ctx.default_dtype)), + ) + ] + ), + ) + + vec_ast = vectorize.visit(ast, vc) + inner_loop = next( + dfs_preorder( + vec_ast, + lambda node: isinstance(node, PsLoop) and node.counter.symbol.name == "j", + ) + ) + decl = inner_loop.body.statements[0] + + assert inner_loop.step.structurally_equal( + PsExpression.make(PsConstant(1, ctx.index_dtype)) + ) + assert isinstance(decl.lhs.symbol.dtype, PsVectorType) + + +def test_invalid_nested_loop(): + ctx = KernelCreationContext() + factory = AstFactory(ctx) + vectorize = AstVectorizer(ctx) + + ctr = ctx.get_symbol("i", ctx.index_dtype) + + axis = VectorizationAxis(ctr) + vc = VectorizationContext(ctx, 4, axis) + + ast = factory.loop_nest( + ("i", "j"), + make_slice[:8, :ctr], # inner loop depends on vectorization axis + PsBlock( + [ + PsDeclaration( + PsExpression.make(ctx.get_symbol("x", ctx.default_dtype)), + PsExpression.make(PsConstant(42, ctx.default_dtype)), + ) + ] + ), + ) + + with pytest.raises(VectorizationError): + _ = vectorize.visit(ast, vc)