diff --git a/src/pystencils/backend/ast/expressions.py b/src/pystencils/backend/ast/expressions.py index a850470ffacf4f528ca5883e5d91b14fa6aa5f9c..167d732c7c9aaa0560798aef5242b9a9eb4e511b 100644 --- a/src/pystencils/backend/ast/expressions.py +++ b/src/pystencils/backend/ast/expressions.py @@ -51,7 +51,7 @@ class PsExpression(PsAstNode, ABC): def get_dtype(self) -> PsType: if self._dtype is None: - raise PsInternalCompilerError("No dtype set on this expression yet.") + raise PsInternalCompilerError(f"No data type set on expression {self}.") return self._dtype diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index f6d9f0993b9e42c24869b30f14101deaab7f6052..3e74e89282b44011295c742d96b35ce63401bc67 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -78,6 +78,28 @@ class X86VectorArch(Enum): ) return suffix + + def intrin_type(self, vtype: PsVectorType): + scalar_type = vtype.scalar_type + match scalar_type: + case Fp(16) if self >= X86VectorArch.AVX512: + suffix = "h" + case Fp(32): + suffix = "" + case Fp(64): + suffix = "d" + case SInt(_): + suffix = "i" + case _: + raise MaterializationError( + f"x86/{self} does not support scalar type {scalar_type}" + ) + + if vtype.width > self.max_vector_width: + raise MaterializationError( + f"x86/{self} does not support {vtype}" + ) + return PsCustomType(f"__m{vtype.width}{suffix}") class X86VectorCpu(GenericVectorCpu): @@ -113,26 +135,7 @@ class X86VectorCpu(GenericVectorCpu): return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: - scalar_type = vector_type.scalar_type - match scalar_type: - case Fp(16) if self._vector_arch >= X86VectorArch.AVX512: - suffix = "h" - case Fp(32): - suffix = "" - case Fp(64): - suffix = "d" - case SInt(_): - suffix = "i" - case _: - raise MaterializationError( - f"x86/{self._vector_arch} does not support scalar type {scalar_type}" - ) - - if vector_type.width > self._vector_arch.max_vector_width: - raise MaterializationError( - f"x86/{self._vector_arch} does not support {vector_type}" - ) - return PsCustomType(f"__m{vector_type.width}{suffix}") + return self._vector_arch.intrin_type(vector_type) def constant_intrinsic(self, c: PsConstant) -> PsExpression: vtype = c.dtype @@ -214,12 +217,14 @@ def _x86_op_intrin( ) -> CFunction: prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) + rtype = atype = varch.intrin_type(vtype) match op: case PsVecBroadcast(): opstr = "set1" if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4: - suffix += "x" + suffix += "x" + atype = vtype.scalar_type case PsAdd(): opstr = "add" case PsSub(): @@ -238,4 +243,4 @@ def _x86_op_intrin( raise MaterializationError(f"Unable to select operation intrinsic for {type(op)}") num_args = 1 if isinstance(op, PsUnOp) else 2 - return CFunction(f"{prefix}_{opstr}_{suffix}", (vtype,) * num_args, vtype) + return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) diff --git a/src/pystencils/backend/transformations/lower_to_c.py b/src/pystencils/backend/transformations/lower_to_c.py index 0576616f2f2989ae72887f5cd720f263017a6b0a..62183fdf7f54e295b7ec459026f248b4018c4546 100644 --- a/src/pystencils/backend/transformations/lower_to_c.py +++ b/src/pystencils/backend/transformations/lower_to_c.py @@ -18,7 +18,7 @@ from ..ast.expressions import ( PsCast, PsSymbolExpr, ) -from ...types import PsStructType, PsPointerType, PsUnsignedIntegerType +from ...types import PsType, PsStructType, PsPointerType, PsUnsignedIntegerType class LowerToC: @@ -37,9 +37,13 @@ class LowerToC: def __init__(self, ctx: KernelCreationContext) -> None: self._ctx = ctx + self._substitutions: dict[PsSymbol, PsSymbol] = dict() + self._typify = Typifier(ctx) - self._substitutions: dict[PsSymbol, PsSymbol] = dict() + from .eliminate_constants import EliminateConstants + + self._fold = EliminateConstants(self._ctx) def __call__(self, node: PsAstNode) -> PsAstNode: self._substitutions = dict() @@ -65,7 +69,8 @@ class LowerToC: return i summands: list[PsExpression] = [ - maybe_cast(cast(PsExpression, self.visit(idx).clone())) * PsExpression.make(stride) + maybe_cast(cast(PsExpression, self.visit(idx).clone())) + * PsExpression.make(stride) for idx, stride in zip(indices, buf.strides, strict=True) ] @@ -77,9 +82,11 @@ class LowerToC: mem_acc = PsMemAcc(bptr.clone(), linearized_idx) - return self._typify.typify_expression( - mem_acc, target_type=buf.element_type - )[0] + return self._fold( + self._typify.typify_expression( + mem_acc, target_type=buf.element_type + )[0] + ) case PsLookup(aggr, member_name) if isinstance( aggr, PsBufferAcc @@ -115,10 +122,7 @@ class LowerToC: const=bp_type.const, restrict=bp_type.restrict, ) - type_erased_bp = PsSymbol( - bp.name, - erased_type - ) + type_erased_bp = PsSymbol(bp.name, erased_type) type_erased_bp.add_property(BufferBasePtr(buf)) self._substitutions[bp] = type_erased_bp else: diff --git a/src/pystencils/config.py b/src/pystencils/config.py index e12a820343fb17530742e7c1c404d6595edc0f9e..9e2af1b7e3cebe1e24b3a103ba72d677fc2d6d38 100644 --- a/src/pystencils/config.py +++ b/src/pystencils/config.py @@ -5,7 +5,7 @@ from warnings import warn from collections.abc import Collection from typing import Sequence -from dataclasses import dataclass, InitVar +from dataclasses import dataclass, InitVar, replace from .target import Target from .field import Field, FieldType @@ -410,7 +410,7 @@ class CreateKernelConfig: warn( "Setting the deprecated `data_type` will override the value of `default_dtype`. " "Set `default_dtype` instead.", - FutureWarning, + UserWarning, ) self.default_dtype = data_type @@ -433,7 +433,52 @@ class CreateKernelConfig: if cpu_vectorize_info is not None: _deprecated_option("cpu_vectorize_info", "cpu_optim.vectorize") - raise NotImplementedError("CPU vectorization is not implemented yet") + if "instruction_set" in cpu_vectorize_info: + if self.target != Target.GenericCPU: + raise PsOptionsError( + "Setting 'instruction_set' in the deprecated 'cpu_vectorize_info' option is only " + "valid if `target == Target.CPU`." + ) + + isa = cpu_vectorize_info["instruction_set"] + vec_target: Target + match isa: + case "best": + vec_target = Target.available_vector_cpu_targets().pop() + case "sse": + vec_target = Target.X86_SSE + case "avx": + vec_target = Target.X86_AVX + case "avx512": + vec_target = Target.X86_AVX512 + case "avx512vl": + vec_target = Target.X86_AVX512 | Target._VL + case _: + raise PsOptionsError( + f'Value {isa} in `cpu_vectorize_info["instruction_set"]` is not supported.' + ) + + warn( + f"Value {isa} for `instruction_set` in deprecated `cpu_vectorize_info` " + "will override the `target` option. " + f"Set `target` to {vec_target} instead.", + UserWarning, + ) + + self.target = vec_target + + deprecated_vec_opts = VectorizationConfig( + assume_inner_stride_one=cpu_vectorize_info.get( + "assume_inner_stride_one", False + ), + assume_aligned=cpu_vectorize_info.get("assume_aligned", False), + use_nontemporal_stores=cpu_vectorize_info.get("nontemporal", False), + ) + + if optim is not None: + optim = replace(optim, vectorize=deprecated_vec_opts) + else: + optim = CpuOptimConfig(vectorize=deprecated_vec_opts) if optim is not None: if self.cpu_optim is not None: diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py index ecf6264717358375d4da7f7880e83588bea24790..651a67cf2092a93e9e1cab3f393c2edd1baf15a9 100644 --- a/src/pystencils/kernelcreation.py +++ b/src/pystencils/kernelcreation.py @@ -144,6 +144,9 @@ class DefaultKernelCreationDriver: if self._cfg.target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) + # Note: After this point, the AST may contain intrinsics, so type-dependent + # transformations cannot be run any more + # Lowering lower_to_c = LowerToC(self._ctx) kernel_ast = cast(PsBlock, lower_to_c(kernel_ast)) @@ -151,18 +154,11 @@ class DefaultKernelCreationDriver: select_functions = SelectFunctions(self._platform) kernel_ast = cast(PsBlock, select_functions(kernel_ast)) - # Late canonicalization and constant elimination passes - # * Since lowering introduces new index calculations and indexing symbols into the AST, - # * these need to be handled here + # Late canonicalization pass: Canonicalize new symbols introduced by LowerToC canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) - late_fold_constants = EliminateConstants( - self._ctx, extract_constant_exprs=False - ) - kernel_ast = cast(PsBlock, late_fold_constants(kernel_ast)) - if self._cfg.target.is_cpu(): return create_cpu_kernel_function( self._ctx, diff --git a/tests/nbackend/transformations/test_lower_to_c.py b/tests/nbackend/transformations/test_lower_to_c.py index b557a7493f9a84cb13b511e8fca1f898823bc9bb..75e6daf4bc67014183f834e5b38fc8245bd13a4f 100644 --- a/tests/nbackend/transformations/test_lower_to_c.py +++ b/tests/nbackend/transformations/test_lower_to_c.py @@ -51,14 +51,13 @@ def test_lower_buffer_accesses(): assert isinstance(fasm_lowered.lhs.pointer, PsSymbolExpr) assert fasm_lowered.lhs.pointer.symbol == f_buf.base_pointer - zero = factory.parse_index(0) expected_offset = reduce( add, ( - (PsExpression.make(dm.counter) + zero) * PsExpression.make(stride) + (PsExpression.make(dm.counter)) * PsExpression.make(stride) for dm, stride in zip(ispace.dimensions, f_buf.strides) ), - ) + factory.parse_index(1) * PsExpression.make(f_buf.strides[-1]) + ) + PsExpression.make(f_buf.strides[-1]) assert fasm_lowered.lhs.offset.structurally_equal(expected_offset) assert isinstance(fasm_lowered.rhs, PsMemAcc) diff --git a/tests/test_quicktests.py b/tests/test_quicktests.py index 506e2bf2ca8a2d6b65212f5bea5caf715201773d..5d5dba0eada50ed7c7f2a6f4f3ddb342781e23b6 100644 --- a/tests/test_quicktests.py +++ b/tests/test_quicktests.py @@ -9,25 +9,32 @@ def test_basic_kernel(): dh = ps.create_data_handling(domain_size=domain_shape, periodicity=True) assert all(dh.periodicity) - f = dh.add_array('f', values_per_cell=1) - tmp = dh.add_array('tmp', values_per_cell=1) + f = dh.add_array("f", values_per_cell=1) + tmp = dh.add_array("tmp", values_per_cell=1) stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)] - stencil_3d = [(1, 0, 0), (-1, 0, 0), (0, 1, 0), (0, -1, 0), (0, 0, 1), (0, 0, -1)] + stencil_3d = [ + (1, 0, 0), + (-1, 0, 0), + (0, 1, 0), + (0, -1, 0), + (0, 0, 1), + (0, 0, -1), + ] stencil = stencil_2d if dh.dim == 2 else stencil_3d jacobi = ps.Assignment(tmp.center, sum(f.neighbors(stencil)) / len(stencil)) kernel = ps.create_kernel(jacobi).compile() for b in dh.iterate(ghost_layers=1): - b['f'].fill(42) + b["f"].fill(42) dh.run_kernel(kernel) for b in dh.iterate(ghost_layers=0): - np.testing.assert_equal(b['f'], 42) + np.testing.assert_equal(b["f"], 42) float_seq = [1.0, 2.0, 3.0, 4.0] int_seq = [1, 2, 3] - for op in ('min', 'max', 'sum'): + for op in ("min", "max", "sum"): assert (dh.reduce_float_sequence(float_seq, op) == float_seq).all() assert (dh.reduce_int_sequence(int_seq, op) == int_seq).all() @@ -37,10 +44,13 @@ def test_basic_blocking_staggered(): f = ps.fields("f: double[2D]") stag = ps.fields("stag(2): double[2D]", field_type=ps.FieldType.STAGGERED) terms = [ - f[0, 0] - f[-1, 0], - f[0, 0] - f[0, -1], + f[0, 0] - f[-1, 0], + f[0, 0] - f[0, -1], + ] + assignments = [ + ps.Assignment(stag.staggered_access(d), terms[i]) + for i, d in enumerate(stag.staggered_stencil) ] - assignments = [ps.Assignment(stag.staggered_access(d), terms[i]) for i, d in enumerate(stag.staggered_stencil)] kernel = ps.create_staggered_kernel(assignments, cpu_blocking=(3, 16)).compile() reference_kernel = ps.create_staggered_kernel(assignments).compile() @@ -52,24 +62,27 @@ def test_basic_blocking_staggered(): np.testing.assert_almost_equal(stag_arr, stag_ref) -@pytest.mark.xfail(reason="Vectorization not implemented yet") def test_basic_vectorization(): - supported_instruction_sets = get_supported_instruction_sets() - if supported_instruction_sets: - instruction_set = supported_instruction_sets[-1] - else: - instruction_set = None + target = ps.Target.auto_cpu() + if not target.is_vector_cpu(): + pytest.skip("No vector CPU available") f, g = ps.fields("f, g : double[2D]") - update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)] - ast = ps.create_kernel(update_rule) + update_rule = [ + ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0) + ] + ast = ps.create_kernel( + update_rule, + target=target, + cpu_optim=ps.CpuOptimConfig( + vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) + ), + ) - replace_inner_stride_with_one(ast) - vectorize(ast, instruction_set=instruction_set) func = ast.compile() arr = np.ones((23 + 2, 17 + 2)) * 5.0 dst = np.zeros_like(arr) func(g=dst, f=arr) - np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0) \ No newline at end of file + np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)