From 3b10456175a4cea8ffb3411eb823bb64bd58c4f4 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 13 Nov 2024 11:11:07 +0100
Subject: [PATCH] Vectorization in Default Kernel Creation Driver

---
 src/pystencils/backend/ast/expressions.py     |   2 +-
 .../backend/kernelcreation/__init__.py        |   3 -
 .../kernelcreation/cpu_optimization.py        |  47 ---
 .../backend/kernelcreation/iteration_space.py |  16 +-
 src/pystencils/backend/platforms/x86.py       |  53 +--
 .../transformations/loop_vectorizer.py        |  18 +
 .../backend/transformations/lower_to_c.py     |  22 +-
 src/pystencils/config.py                      | 104 +++++-
 src/pystencils/kernelcreation.py              | 323 ++++++++++++------
 src/pystencils/target.py                      |   2 +-
 src/pystencils/types/meta.py                  |   2 +-
 tests/kernelcreation/test_domain_kernels.py   |  95 ++++--
 tests/kernelcreation/test_sliced_iteration.py |   3 -
 tests/nbackend/test_vectorization.py          |  63 ++--
 .../transformations/test_lower_to_c.py        |   5 +-
 tests/test_quicktests.py                      |  53 +--
 16 files changed, 530 insertions(+), 281 deletions(-)
 delete mode 100644 src/pystencils/backend/kernelcreation/cpu_optimization.py

diff --git a/src/pystencils/backend/ast/expressions.py b/src/pystencils/backend/ast/expressions.py
index a850470ff..167d732c7 100644
--- a/src/pystencils/backend/ast/expressions.py
+++ b/src/pystencils/backend/ast/expressions.py
@@ -51,7 +51,7 @@ class PsExpression(PsAstNode, ABC):
 
     def get_dtype(self) -> PsType:
         if self._dtype is None:
-            raise PsInternalCompilerError("No dtype set on this expression yet.")
+            raise PsInternalCompilerError(f"No data type set on expression {self}.")
 
         return self._dtype
 
diff --git a/src/pystencils/backend/kernelcreation/__init__.py b/src/pystencils/backend/kernelcreation/__init__.py
index abba9d9d8..622027679 100644
--- a/src/pystencils/backend/kernelcreation/__init__.py
+++ b/src/pystencils/backend/kernelcreation/__init__.py
@@ -12,8 +12,6 @@ from .iteration_space import (
     create_sparse_iteration_space,
 )
 
-from .cpu_optimization import optimize_cpu
-
 __all__ = [
     "KernelCreationContext",
     "KernelAnalysis",
@@ -25,5 +23,4 @@ __all__ = [
     "SparseIterationSpace",
     "create_full_iteration_space",
     "create_sparse_iteration_space",
-    "optimize_cpu",
 ]
diff --git a/src/pystencils/backend/kernelcreation/cpu_optimization.py b/src/pystencils/backend/kernelcreation/cpu_optimization.py
deleted file mode 100644
index 46fef6603..000000000
--- a/src/pystencils/backend/kernelcreation/cpu_optimization.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from __future__ import annotations
-from typing import cast, TYPE_CHECKING
-
-from .context import KernelCreationContext
-from ..ast.structural import PsBlock
-
-from ...config import CpuOptimConfig, OpenMpConfig
-
-if TYPE_CHECKING:
-    from ..platforms import GenericCpu
-
-
-def optimize_cpu(
-    ctx: KernelCreationContext,
-    platform: GenericCpu,
-    kernel_ast: PsBlock,
-    cfg: CpuOptimConfig | None,
-) -> PsBlock:
-    """Carry out CPU-specific optimizations according to the given configuration."""
-    from ..transformations import CanonicalizeSymbols, HoistLoopInvariantDeclarations
-
-    canonicalize = CanonicalizeSymbols(ctx, True)
-    kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
-
-    hoist_invariants = HoistLoopInvariantDeclarations(ctx)
-    kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
-
-    if cfg is None:
-        return kernel_ast
-
-    if cfg.loop_blocking:
-        raise NotImplementedError("Loop blocking not implemented yet.")
-
-    if cfg.vectorize is not False:
-        raise NotImplementedError("Vectorization not implemented yet")
-
-    if cfg.openmp is not False:
-        from ..transformations import AddOpenMP
-
-        params = cfg.openmp if isinstance(cfg.openmp, OpenMpConfig) else OpenMpConfig()
-        add_omp = AddOpenMP(ctx, params)
-        kernel_ast = cast(PsBlock, add_omp(kernel_ast))
-
-    if cfg.use_cacheline_zeroing:
-        raise NotImplementedError("CL-zeroing not implemented yet")
-
-    return kernel_ast
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index ba77ad24d..05e7153bf 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -209,17 +209,23 @@ class FullIterationSpace(IterationSpace):
     @property
     def archetype_field(self) -> Field | None:
         return self._archetype_field
+
+    @property
+    def loop_order(self) -> tuple[int, ...]:
+        """Return the loop order of this iteration space, ordered from slowest to fastest coordinate."""
+        if self._archetype_field is not None:
+            return self._archetype_field.layout
+        else:
+            return tuple(range(len(self.dimensions)))
 
     def dimensions_in_loop_order(self) -> Sequence[FullIterationSpace.Dimension]:
         """Return the dimensions of this iteration space ordered from the slowest to the fastest coordinate.
 
-        If an archetype field is specified, the field layout is used to determine the ideal loop order;
+        If this iteration space has an `archetype field <FullIterationSpace.archetype_field>` set,
+        its field layout is used to determine the ideal loop order;
         otherwise, the dimensions are returned as they are
         """
-        if self._archetype_field is not None:
-            return [self._dimensions[i] for i in self._archetype_field.layout]
-        else:
-            return self._dimensions
+        return [self._dimensions[i] for i in self.loop_order]
 
     def actual_iterations(
         self, dimension: int | FullIterationSpace.Dimension | None = None
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index aaa8b351b..3e74e8928 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -21,6 +21,7 @@ from ..constants import PsConstant
 from ..exceptions import MaterializationError
 
 from .generic_cpu import GenericVectorCpu
+from ..kernelcreation import KernelCreationContext
 
 from ...types.quick import Fp, SInt
 from ..functions import CFunction
@@ -77,6 +78,28 @@ class X86VectorArch(Enum):
                 )
 
         return suffix
+
+    def intrin_type(self, vtype: PsVectorType) -> PsCustomType:
+        scalar_type = vtype.scalar_type
+        match scalar_type:
+            case Fp(16) if self >= X86VectorArch.AVX512:
+                suffix = "h"
+            case Fp(32):
+                suffix = ""
+            case Fp(64):
+                suffix = "d"
+            case SInt(_):
+                suffix = "i"
+            case _:
+                raise MaterializationError(
+                    f"x86/{self} does not support scalar type {scalar_type}"
+                )
+
+        if vtype.width > self.max_vector_width:
+            raise MaterializationError(
+                f"x86/{self} does not support {vtype}"
+            )
+        return PsCustomType(f"__m{vtype.width}{suffix}")
 
 
 class X86VectorCpu(GenericVectorCpu):
@@ -86,7 +109,8 @@ class X86VectorCpu(GenericVectorCpu):
     https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html.
""" - def __init__(self, vector_arch: X86VectorArch): + def __init__(self, ctx: KernelCreationContext, vector_arch: X86VectorArch): + super().__init__(ctx) self._vector_arch = vector_arch @property @@ -111,26 +135,7 @@ class X86VectorCpu(GenericVectorCpu): return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: - scalar_type = vector_type.scalar_type - match scalar_type: - case Fp(16) if self._vector_arch >= X86VectorArch.AVX512: - suffix = "h" - case Fp(32): - suffix = "" - case Fp(64): - suffix = "d" - case SInt(_): - suffix = "i" - case _: - raise MaterializationError( - f"x86/{self._vector_arch} does not support scalar type {scalar_type}" - ) - - if vector_type.width > self._vector_arch.max_vector_width: - raise MaterializationError( - f"x86/{self._vector_arch} does not support {vector_type}" - ) - return PsCustomType(f"__m{vector_type.width}{suffix}") + return self._vector_arch.intrin_type(vector_type) def constant_intrinsic(self, c: PsConstant) -> PsExpression: vtype = c.dtype @@ -212,12 +217,14 @@ def _x86_op_intrin( ) -> CFunction: prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) + rtype = atype = varch.intrin_type(vtype) match op: case PsVecBroadcast(): opstr = "set1" if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4: - suffix += "x" + suffix += "x" + atype = vtype.scalar_type case PsAdd(): opstr = "add" case PsSub(): @@ -236,4 +243,4 @@ def _x86_op_intrin( raise MaterializationError(f"Unable to select operation intrinsic for {type(op)}") num_args = 1 if isinstance(op, PsUnOp) else 2 - return CFunction(f"{prefix}_{opstr}_{suffix}", (vtype,) * num_args, vtype) + return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index e01e657e3..c89698193 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -61,6 +61,24 @@ class LoopVectorizer: self._vectorize_ast = AstVectorizer(ctx) self._fold = EliminateConstants(ctx) + @overload + def vectorize_select_loops( + self, node: PsBlock, predicate: Callable[[PsLoop], bool] + ) -> PsBlock: + ... + + @overload + def vectorize_select_loops( + self, node: PsLoop, predicate: Callable[[PsLoop], bool] + ) -> PsLoop | PsBlock: + ... + + @overload + def vectorize_select_loops( + self, node: PsAstNode, predicate: Callable[[PsLoop], bool] + ) -> PsAstNode: + ... 
+
     def vectorize_select_loops(
         self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
     ) -> PsAstNode:
diff --git a/src/pystencils/backend/transformations/lower_to_c.py b/src/pystencils/backend/transformations/lower_to_c.py
index 0576616f2..72ac80891 100644
--- a/src/pystencils/backend/transformations/lower_to_c.py
+++ b/src/pystencils/backend/transformations/lower_to_c.py
@@ -37,9 +37,13 @@ class LowerToC:
 
     def __init__(self, ctx: KernelCreationContext) -> None:
         self._ctx = ctx
+        self._substitutions: dict[PsSymbol, PsSymbol] = dict()
+
         self._typify = Typifier(ctx)
 
-        self._substitutions: dict[PsSymbol, PsSymbol] = dict()
+        from .eliminate_constants import EliminateConstants
+
+        self._fold = EliminateConstants(self._ctx)
 
     def __call__(self, node: PsAstNode) -> PsAstNode:
         self._substitutions = dict()
@@ -65,7 +69,8 @@ class LowerToC:
             return i
 
         summands: list[PsExpression] = [
-            maybe_cast(cast(PsExpression, self.visit(idx).clone())) * PsExpression.make(stride)
+            maybe_cast(cast(PsExpression, self.visit(idx).clone()))
+            * PsExpression.make(stride)
             for idx, stride in zip(indices, buf.strides, strict=True)
         ]
 
@@ -77,9 +82,11 @@ class LowerToC:
 
                 mem_acc = PsMemAcc(bptr.clone(), linearized_idx)
 
-                return self._typify.typify_expression(
-                    mem_acc, target_type=buf.element_type
-                )[0]
+                return self._fold(
+                    self._typify.typify_expression(
+                        mem_acc, target_type=buf.element_type
+                    )[0]
+                )
 
             case PsLookup(aggr, member_name) if isinstance(
                 aggr, PsBufferAcc
@@ -115,10 +122,7 @@ class LowerToC:
                     const=bp_type.const,
                     restrict=bp_type.restrict,
                 )
-                type_erased_bp = PsSymbol(
-                    bp.name,
-                    erased_type
-                )
+                type_erased_bp = PsSymbol(bp.name, erased_type)
                 type_erased_bp.add_property(BufferBasePtr(buf))
                 self._substitutions[bp] = type_erased_bp
             else:
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index db2fd545a..9e2af1b7e 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -5,12 +5,18 @@ from warnings import warn
 from collections.abc import Collection
 from typing import Sequence
 
-from dataclasses import dataclass, InitVar
+from dataclasses import dataclass, InitVar, replace
 
 from .target import Target
 from .field import Field, FieldType
-from .types import PsIntegerType, UserTypeSpec, PsIeeeFloatType, create_type
+from .types import (
+    PsIntegerType,
+    UserTypeSpec,
+    PsIeeeFloatType,
+    PsScalarType,
+    create_type,
+)
 
 from .defaults import DEFAULTS
 
@@ -90,6 +96,14 @@ class CpuOptimConfig:
     to produce cacheline zeroing instructions where possible.
     """
 
+    def get_vectorization_config(self) -> VectorizationConfig | None:
+        if self.vectorize is True:
+            return VectorizationConfig()
+        elif isinstance(self.vectorize, VectorizationConfig):
+            return self.vectorize
+        else:
+            return None
+
 
 @dataclass
 class VectorizationConfig:
@@ -99,14 +113,13 @@ class VectorizationConfig:
     in `CreateKernelConfig.target`, an error will be raised.
     """
 
-    vector_width: int | None = None
-    """Desired vector register width in bits.
-
-    If set to an integer value, the vectorizer will use this as the desired vector register width.
+    lanes: int | None = None
+    """Number of SIMD lanes to be used in vectorization.
 
-    If set to `None`, the vector register width will be automatically set to the broadest possible.
+    If set to `None` (the default), the number of lanes is chosen automatically to fill the broadest vector registers supported by the target.
 
-    If the selected CPU does not support the given width, an error will be raised.
+    If the CPU architecture specified in `target <CreateKernelConfig.target>` does not support some
+    operation contained in the kernel with the given number of lanes, an error will be raised.
     """
 
     use_nontemporal_stores: bool | Collection[str | Field] = False
@@ -134,6 +147,25 @@ class VectorizationConfig:
     that is not equal to one, an error will be raised.
     """
 
+    @staticmethod
+    def default_lanes(target: Target, dtype: PsScalarType) -> int:
+        if not target.is_vector_cpu():
+            raise ValueError(f"Given target {target} is not a vector CPU target.")
+
+        assert dtype.itemsize is not None
+
+        match target:
+            case Target.X86_SSE:
+                return 128 // (dtype.itemsize * 8)
+            case Target.X86_AVX:
+                return 256 // (dtype.itemsize * 8)
+            case Target.X86_AVX512 | Target.X86_AVX512_FP16:
+                return 512 // (dtype.itemsize * 8)
+            case _:
+                raise NotImplementedError(
+                    f"No default number of lanes known for {dtype} on {target}"
+                )
+
 
 @dataclass
 class GpuIndexingConfig:
@@ -266,6 +298,13 @@ class CreateKernelConfig:
 
     #   Getters
 
+    def get_target(self) -> Target:
+        match self.target:
+            case Target.CurrentCPU:
+                return Target.auto_cpu()
+            case _:
+                return self.target
+
     def get_jit(self) -> JitBase:
         """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
         if self.jit is None:
@@ -371,7 +410,7 @@ class CreateKernelConfig:
             warn(
                 "Setting the deprecated `data_type` will override the value of `default_dtype`. "
                 "Set `default_dtype` instead.",
-                FutureWarning,
+                UserWarning,
             )
             self.default_dtype = data_type
 
@@ -394,7 +433,52 @@ class CreateKernelConfig:
 
         if cpu_vectorize_info is not None:
             _deprecated_option("cpu_vectorize_info", "cpu_optim.vectorize")
-            raise NotImplementedError("CPU vectorization is not implemented yet")
+            if "instruction_set" in cpu_vectorize_info:
+                if self.target != Target.GenericCPU:
+                    raise PsOptionsError(
+                        "Setting 'instruction_set' in the deprecated 'cpu_vectorize_info' option is only "
+                        "valid if `target == Target.GenericCPU`."
+                    )
+
+                isa = cpu_vectorize_info["instruction_set"]
+                vec_target: Target
+                match isa:
+                    case "best":
+                        vec_target = Target.available_vector_cpu_targets().pop()
+                    case "sse":
+                        vec_target = Target.X86_SSE
+                    case "avx":
+                        vec_target = Target.X86_AVX
+                    case "avx512":
+                        vec_target = Target.X86_AVX512
+                    case "avx512vl":
+                        vec_target = Target.X86_AVX512 | Target._VL
+                    case _:
+                        raise PsOptionsError(
+                            f'Value {isa} in `cpu_vectorize_info["instruction_set"]` is not supported.'
+                        )
+
+                warn(
+                    f"Value {isa} for `instruction_set` in deprecated `cpu_vectorize_info` "
+                    "will override the `target` option. "
+                    f"Set `target` to {vec_target} instead.",
+                    UserWarning,
+                )
+
+                self.target = vec_target
+
+            deprecated_vec_opts = VectorizationConfig(
+                assume_inner_stride_one=cpu_vectorize_info.get(
+                    "assume_inner_stride_one", False
+                ),
+                assume_aligned=cpu_vectorize_info.get("assume_aligned", False),
+                use_nontemporal_stores=cpu_vectorize_info.get("nontemporal", False),
+            )
+
+            if optim is not None:
+                optim = replace(optim, vectorize=deprecated_vec_opts)
+            else:
+                optim = CpuOptimConfig(vectorize=deprecated_vec_opts)
 
         if optim is not None:
             if self.cpu_optim is not None:
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index a10fadf01..651a67cf2 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -2,27 +2,35 @@ from typing import cast, Sequence
 from dataclasses import replace
 
 from .target import Target
-from .config import CreateKernelConfig
+from .config import (
+    CreateKernelConfig,
+    OpenMpConfig,
+    VectorizationConfig,
+)
 from .backend import KernelFunction
-from .types import create_numeric_type, PsIntegerType
-from .backend.ast.structural import PsBlock
+from .types import create_numeric_type, PsIntegerType, PsScalarType
+from .backend.ast.structural import PsBlock, PsLoop
 
 from .backend.kernelcreation import (
     KernelCreationContext,
     KernelAnalysis,
     FreezeExpressions,
     Typifier,
 )
+from .backend.constants import PsConstant
 
 from .backend.kernelcreation.iteration_space import (
     create_sparse_iteration_space,
     create_full_iteration_space,
+    FullIterationSpace,
 )
-
+from .backend.platforms import Platform, GenericCpu, GenericVectorCpu, GenericGpu
+from .backend.exceptions import VectorizationError
 
 from .backend.transformations import (
     EliminateConstants,
     LowerToC,
     SelectFunctions,
     CanonicalizeSymbols,
+    HoistLoopInvariantDeclarations,
 )
 from .backend.kernelfunction import (
     create_cpu_kernel_function,
@@ -60,125 +68,245 @@ def create_kernel(
     if kwargs:
         config = replace(config, **kwargs)
 
-    idx_dtype = create_numeric_type(config.index_dtype)
-    assert isinstance(idx_dtype, PsIntegerType)
+    driver = DefaultKernelCreationDriver(config)
+    return driver(assignments)
 
-    ctx = KernelCreationContext(
-        default_dtype=create_numeric_type(config.default_dtype),
-        index_dtype=idx_dtype,
-    )
 
-    if isinstance(assignments, AssignmentBase):
-        assignments = [assignments]
+class DefaultKernelCreationDriver:
+    def __init__(self, cfg: CreateKernelConfig):
+        self._cfg = cfg
 
-    if not isinstance(assignments, AssignmentCollection):
-        assignments = AssignmentCollection(assignments)  # type: ignore
+        idx_dtype = create_numeric_type(self._cfg.index_dtype)
+        assert isinstance(idx_dtype, PsIntegerType)
 
-    _ = _parse_simplification_hints(assignments)
+        self._ctx = KernelCreationContext(
+            default_dtype=create_numeric_type(self._cfg.default_dtype),
+            index_dtype=idx_dtype,
+        )
 
-    analysis = KernelAnalysis(
-        ctx, not config.skip_independence_check, not config.allow_double_writes
-    )
-    analysis(assignments)
+        self._target = self._cfg.get_target()
+        self._platform = self._get_platform()
 
-    if len(ctx.fields.index_fields) > 0 or config.index_field is not None:
-        ispace = create_sparse_iteration_space(
-            ctx, assignments, index_field=config.index_field
-        )
-    else:
-        ispace = create_full_iteration_space(
-            ctx,
-            assignments,
-            ghost_layers=config.ghost_layers,
-            iteration_slice=config.iteration_slice,
+    def __call__(
+        self,
+        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    ):
+        if isinstance(assignments, AssignmentBase):
+            assignments = [assignments]
+
+        if not isinstance(assignments, AssignmentCollection):
+            assignments = AssignmentCollection(assignments)  # type: ignore
+
+        _ = _parse_simplification_hints(assignments)
+
+        analysis = KernelAnalysis(
+            self._ctx,
+            not self._cfg.skip_independence_check,
+            not self._cfg.allow_double_writes,
         )
+        analysis(assignments)
 
-    ctx.set_iteration_space(ispace)
+        if len(self._ctx.fields.index_fields) > 0 or self._cfg.index_field is not None:
+            ispace = create_sparse_iteration_space(
+                self._ctx, assignments, index_field=self._cfg.index_field
+            )
+        else:
+            ispace = create_full_iteration_space(
+                self._ctx,
+                assignments,
+                ghost_layers=self._cfg.ghost_layers,
+                iteration_slice=self._cfg.iteration_slice,
+            )
 
-    freeze = FreezeExpressions(ctx)
-    kernel_body = freeze(assignments)
+        self._ctx.set_iteration_space(ispace)
 
-    typify = Typifier(ctx)
-    kernel_body = typify(kernel_body)
+        freeze = FreezeExpressions(self._ctx)
+        kernel_body = freeze(assignments)
 
-    match config.target:
-        case Target.GenericCPU:
-            from .backend.platforms import GenericCpu
+        typify = Typifier(self._ctx)
+        kernel_body = typify(kernel_body)
 
-            platform = GenericCpu(ctx)
-            kernel_ast = platform.materialize_iteration_space(kernel_body, ispace)
+        match self._platform:
+            case GenericCpu():
+                kernel_ast = self._platform.materialize_iteration_space(
+                    kernel_body, ispace
+                )
+            case GenericGpu():
+                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
+                    kernel_body, ispace
+                )
 
-        case target if target.is_gpu():
-            match target:
-                case Target.SYCL:
-                    from .backend.platforms import SyclPlatform
+        #   Fold and extract constants
+        elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
+        kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
 
-                    platform = SyclPlatform(ctx, config.gpu_indexing)
-                case Target.CUDA:
-                    from .backend.platforms import CudaPlatform
+        #   Target-specific optimizations
+        if self._cfg.target.is_cpu():
+            kernel_ast = self._transform_for_cpu(kernel_ast)
+
+        #   Note: After this point, the AST may contain intrinsics, so type-dependent
+        #   transformations cannot be run any more
+
+        #   Lowering
+        lower_to_c = LowerToC(self._ctx)
+        kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
+
+        select_functions = SelectFunctions(self._platform)
+        kernel_ast = cast(PsBlock, select_functions(kernel_ast))
+
+        #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC
 
-                    platform = CudaPlatform(ctx, config.gpu_indexing)
-                case _:
-                    raise NotImplementedError(
-                        f"Code generation for target {target} not implemented"
-                    )
+        canonicalize = CanonicalizeSymbols(self._ctx, True)
+        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
 
-            kernel_ast, gpu_threads = platform.materialize_iteration_space(
-                kernel_body, ispace
+        if self._cfg.target.is_cpu():
+            return create_cpu_kernel_function(
+                self._ctx,
+                self._platform,
+                kernel_ast,
+                self._cfg.function_name,
+                self._cfg.target,
+                self._cfg.get_jit(),
             )
+        else:
+            return create_gpu_kernel_function(
+                self._ctx,
+                self._platform,
+                kernel_ast,
+                gpu_threads,
+                self._cfg.function_name,
+                self._cfg.target,
+                self._cfg.get_jit(),
+            )
+
+    def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock:
+        canonicalize = CanonicalizeSymbols(self._ctx, True)
+        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
+
+        hoist_invariants = HoistLoopInvariantDeclarations(self._ctx)
+        kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
+
+        cpu_cfg = self._cfg.cpu_optim
+
+        if cpu_cfg is None:
+            return kernel_ast
 
-        case _:
-            raise NotImplementedError(
-                f"Code generation for target {target} not implemented"
+        if cpu_cfg.loop_blocking:
+            raise NotImplementedError("Loop blocking not implemented yet.")
+
+        kernel_ast = self._vectorize(kernel_ast)
+
+        if cpu_cfg.openmp is not False:
+            from .backend.transformations import AddOpenMP
+
+            params = (
+                cpu_cfg.openmp
+                if isinstance(cpu_cfg.openmp, OpenMpConfig)
+                else OpenMpConfig()
             )
+            add_omp = AddOpenMP(self._ctx, params)
+            kernel_ast = cast(PsBlock, add_omp(kernel_ast))
+
+        if cpu_cfg.use_cacheline_zeroing:
+            raise NotImplementedError("CL-zeroing not implemented yet")
 
-    # Fold and extract constants
-    elim_constants = EliminateConstants(ctx, extract_constant_exprs=True)
-    kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
+        return kernel_ast
 
-    # Target-Specific optimizations
-    if config.target.is_cpu():
-        from .backend.kernelcreation import optimize_cpu
+    def _vectorize(self, kernel_ast: PsBlock) -> PsBlock:
+        assert self._cfg.cpu_optim is not None
+        vec_config = self._cfg.cpu_optim.get_vectorization_config()
+        if vec_config is None:
+            return kernel_ast
 
-        assert isinstance(platform, GenericCpu)
+        from .backend.transformations import LoopVectorizer, SelectIntrinsics
 
-        kernel_ast = optimize_cpu(ctx, platform, kernel_ast, config.cpu_optim)
+        assert isinstance(self._platform, GenericVectorCpu)
 
-    # Lowering
-    lower_to_c = LowerToC(ctx)
-    kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
+        ispace = self._ctx.get_iteration_space()
+        if not isinstance(ispace, FullIterationSpace):
+            raise VectorizationError(
+                "Unable to vectorize kernel: The kernel is not using a dense iteration space."
+            )
 
-    select_functions = SelectFunctions(platform)
-    kernel_ast = cast(PsBlock, select_functions(kernel_ast))
+        inner_loop_coord = ispace.loop_order[-1]
+        inner_loop_dim = ispace.dimensions[inner_loop_coord]
+
+        #   Apply stride (TODO: and alignment) assumptions
+        if vec_config.assume_inner_stride_one:
+            for field in self._ctx.fields:
+                buf = self._ctx.get_buffer(field)
+                inner_stride = buf.strides[inner_loop_coord]
+                if isinstance(inner_stride, PsConstant):
+                    if inner_stride.value != 1:
+                        raise VectorizationError(
+                            f"Unable to apply assumption 'assume_inner_stride_one': "
+                            f"Field {field} has fixed stride {inner_stride} "
+                            f"set in the inner coordinate {inner_loop_coord}."
+                        )
+                else:
+                    buf.strides[inner_loop_coord] = PsConstant(1, buf.index_type)
+                    #   TODO: Communicate assumption to runtime system via a precondition
+
+        #   Call loop vectorizer
+        if vec_config.lanes is None:
+            lanes = VectorizationConfig.default_lanes(
+                self._target, cast(PsScalarType, self._ctx.default_dtype)
+            )
+        else:
+            lanes = vec_config.lanes
+
+        vectorizer = LoopVectorizer(self._ctx, lanes)
+
+        def loop_predicate(loop: PsLoop):
+            return loop.counter.symbol == inner_loop_dim.counter
+
+        kernel_ast = vectorizer.vectorize_select_loops(kernel_ast, loop_predicate)
+
+        select_intrin = SelectIntrinsics(self._ctx, self._platform)
+        kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
+
+        return kernel_ast
+
+    def _get_platform(self) -> Platform:
+        if Target._CPU in self._target:
+            if Target._X86 in self._target:
+                from .backend.platforms.x86 import X86VectorArch, X86VectorCpu
+
+                arch: X86VectorArch
+
+                if Target._SSE in self._target:
+                    arch = X86VectorArch.SSE
+                elif Target._AVX in self._target:
+                    arch = X86VectorArch.AVX
+                elif Target._AVX512 in self._target:
+                    if Target._FP16 in self._target:
+                        arch = X86VectorArch.AVX512_FP16
+                    else:
+                        arch = X86VectorArch.AVX512
+                else:
+                    assert False, "unreachable code"
+
+                return X86VectorCpu(self._ctx, arch)
+            elif self._target == Target.GenericCPU:
+                return GenericCpu(self._ctx)
+            else:
+                raise NotImplementedError(
+                    f"No platform is currently available for CPU target {self._target}"
+                )
+
+        elif Target._GPU in self._target:
+            match self._target:
+                case Target.SYCL:
+                    from .backend.platforms import SyclPlatform
 
-    # Late canonicalization and constant elimination passes
-    # * Since lowering introduces new index calculations and indexing symbols into the AST,
-    # * these need to be handled here
-
-    canonicalize = CanonicalizeSymbols(ctx, True)
-    kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
+                    return SyclPlatform(self._ctx, self._cfg.gpu_indexing)
+                case Target.CUDA:
+                    from .backend.platforms import CudaPlatform
 
-    late_fold_constants = EliminateConstants(ctx, extract_constant_exprs=False)
-    kernel_ast = cast(PsBlock, late_fold_constants(kernel_ast))
+                    return CudaPlatform(self._ctx, self._cfg.gpu_indexing)
 
-    if config.target.is_cpu():
-        return create_cpu_kernel_function(
-            ctx,
-            platform,
-            kernel_ast,
-            config.function_name,
-            config.target,
-            config.get_jit(),
-        )
-    else:
-        return create_gpu_kernel_function(
-            ctx,
-            platform,
-            kernel_ast,
-            gpu_threads,
-            config.function_name,
-            config.target,
-            config.get_jit(),
+        raise NotImplementedError(
+            f"Code generation for target {self._target} not implemented"
         )
 
 
@@ -192,6 +320,9 @@ def create_staggered_kernel(
 
 #   Internals
 
+
 def _parse_simplification_hints(ac: AssignmentCollection):
     if "split_groups" in ac.simplification_hints:
-        raise NotImplementedError("Loop splitting was requested, but is not implemented yet")
+        raise NotImplementedError(
+            "Loop splitting was requested, but is not implemented yet"
+        )
diff --git a/src/pystencils/target.py b/src/pystencils/target.py
index 7ed4f719d..7f26c4466 100644
--- a/src/pystencils/target.py
+++ b/src/pystencils/target.py
@@ -118,7 +118,7 @@ class Target(Flag):
 
     @staticmethod
     def available_vector_cpu_targets() -> list[Target]:
-        """Returns a list of available (vector) CPU targets, ordered from least to most capable."""
+        """Returns a list of available vector CPU targets, ordered from least to most capable."""
         return _available_vector_targets()
 
diff --git a/src/pystencils/types/meta.py b/src/pystencils/types/meta.py
index 9389279bc..cf07e9aff 100644
--- a/src/pystencils/types/meta.py
+++ b/src/pystencils/types/meta.py
@@ -138,7 +138,7 @@ class PsType(metaclass=PsTypeMeta):
 
     @property
     def itemsize(self) -> int | None:
-        """If this type has a valid in-memory size, return that size."""
+        """If this type has a valid in-memory size, return that size in bytes."""
         return None
 
     @property
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index 5850c94d7..b9cebb354 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -2,32 +2,84 @@ import pytest
 import sympy as sp
 import numpy as np
 
-from pystencils import fields, Field, AssignmentCollection, Target, CreateKernelConfig
+from dataclasses import replace
+
+from pystencils import (
+    fields,
+    Field,
+    AssignmentCollection,
+    Target,
+    CreateKernelConfig,
+    CpuOptimConfig,
+    VectorizationConfig,
+)
 from pystencils.assignment import assignment_from_stencil
 
-from pystencils.kernelcreation import create_kernel
+from pystencils.kernelcreation import create_kernel, KernelFunction
+from pystencils.backend.emission import emit_code
 
+AVAILABLE_TARGETS = [Target.GenericCPU]
 
-@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
-def test_filter_kernel(target):
-    if target == Target.CUDA:
-        xp = pytest.importorskip("cupy")
+try:
+    import cupy
+
+    AVAILABLE_TARGETS += [Target.CUDA]
+except ImportError:
+    pass
+
+AVAILABLE_TARGETS += Target.available_vector_cpu_targets()
+TEST_IDS = [t.name for t in AVAILABLE_TARGETS]
+
+
+@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
+def gen_config(request):
+    target: Target = request.param
+
+    gen_config = CreateKernelConfig(target=target)
+
+    if Target._VECTOR in target:
+        gen_config = replace(
+            gen_config,
+            cpu_optim=CpuOptimConfig(
+                vectorize=VectorizationConfig(assume_inner_stride_one=True)
+            ),
+        )
+
+    return gen_config
+
+
+def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
+    code = emit_code(kernel)
+
+    match gen_config.target:
+        case Target.X86_SSE:
+            assert "_mm_loadu_pd" in code
+            assert "_mm_storeu_pd" in code
+        case Target.X86_AVX:
+            assert "_mm256_loadu_pd" in code
+            assert "_mm256_storeu_pd" in code
+        case Target.X86_AVX512:
+            assert "_mm512_loadu_pd" in code
+            assert "_mm512_storeu_pd" in code
+
+
+def test_filter_kernel(gen_config):
+    if gen_config.target == Target.CUDA:
+        import cupy as cp
+
+        xp = cp
     else:
         xp = np
 
     weight = sp.Symbol("weight")
-    stencil = [
-        [1, 1, 1],
-        [1, 1, 1],
-        [1, 1, 1]
-    ]
+    stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
     src, dst = fields("src, dst: [2D]")
     asm = assignment_from_stencil(stencil, src, dst, normalization_factor=weight)
     asms = AssignmentCollection([asm])
 
-    gen_config = CreateKernelConfig(target=target)
     ast = create_kernel(asms, gen_config)
+    inspect_dp_kernel(ast, gen_config)
     kernel = ast.compile()
 
     src_arr = xp.ones((42, 31))
@@ -41,31 +93,28 @@ def test_filter_kernel(target):
     xp.testing.assert_allclose(dst_arr, expected)
 
 
-@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
-def test_filter_kernel_fixedsize(target):
-    if target == Target.CUDA:
-        xp = pytest.importorskip("cupy")
+def test_filter_kernel_fixedsize(gen_config):
+    if gen_config.target == Target.CUDA:
+        import cupy as cp
+
+        xp = cp
     else:
         xp = np
 
     weight = sp.Symbol("weight")
-    stencil = [
-        [1, 1, 1],
-        [1, 1, 1],
-        [1, 1, 1]
-    ]
+    stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
     src_arr = xp.ones((42, 31))
     dst_arr = xp.zeros_like(src_arr)
 
     src = Field.create_from_numpy_array("src", src_arr)
     dst = Field.create_from_numpy_array("dst", dst_arr)
-    
+
     asm = assignment_from_stencil(stencil, src, dst, normalization_factor=weight)
     asms = AssignmentCollection([asm])
 
-    gen_config = CreateKernelConfig(target=target)
     ast = create_kernel(asms, gen_config)
+    inspect_dp_kernel(ast, gen_config)
     kernel = ast.compile()
 
     kernel(src=src_arr, dst=dst_arr, weight=2.0)
diff --git a/tests/kernelcreation/test_sliced_iteration.py b/tests/kernelcreation/test_sliced_iteration.py
index 8238a97da..5eff0a89d 100644
--- a/tests/kernelcreation/test_sliced_iteration.py
+++ b/tests/kernelcreation/test_sliced_iteration.py
@@ -27,6 +27,3 @@ def test_sliced_iteration():
     expected_result = np.zeros(size)
     expected_result[1:x_end_value, 1] = 1
     np.testing.assert_almost_equal(expected_result, dst_arr)
-
-
-test_sliced_iteration()
diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py
index 0af614b23..aeb0ebaa0 100644
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -3,6 +3,8 @@ import sympy as sp
 import numpy as np
 from dataclasses import dataclass
 from itertools import chain
+from functools import partial
+from typing import Callable
 
 from pystencils.backend.kernelcreation import (
     KernelCreationContext,
@@ -28,64 +30,52 @@ from pystencils.types.quick import SInt, Fp
 
 @dataclass
 class VectorTestSetup:
-    platform: GenericVectorCpu
+    target: Target
+    platform_factory: Callable[[KernelCreationContext], GenericVectorCpu]
     lanes: int
     numeric_dtype: PsScalarType
    index_dtype: PsIntegerType
 
     @property
     def name(self) -> str:
-        if isinstance(self.platform, X86VectorCpu):
-            match self.platform.vector_arch:
-                case X86VectorArch.SSE:
-                    isa = "SSE"
-                case X86VectorArch.AVX:
-                    isa = "AVX"
-                case X86VectorArch.AVX512:
-                    isa = "AVX512"
-                case X86VectorArch.AVX512_FP16:
-                    isa = "AVX512_FP16"
-        else:
-            assert False
-
-        return f"{isa}/{self.numeric_dtype}<{self.lanes}>/{self.index_dtype}"
+        return f"{self.target.name}/{self.numeric_dtype}<{self.lanes}>/{self.index_dtype}"
 
 
 def get_setups(target: Target) -> list[VectorTestSetup]:
     match target:
         case Target.X86_SSE:
-            sse_platform = X86VectorCpu(X86VectorArch.SSE)
+            sse_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.SSE)
             return [
-                VectorTestSetup(sse_platform, 4, Fp(32), SInt(32)),
-                VectorTestSetup(sse_platform, 2, Fp(64), SInt(64)),
+                VectorTestSetup(target, sse_platform, 4, Fp(32), SInt(32)),
+                VectorTestSetup(target, sse_platform, 2, Fp(64), SInt(64)),
             ]
 
         case Target.X86_AVX:
-            avx_platform = X86VectorCpu(X86VectorArch.AVX)
+            avx_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.AVX)
             return [
-                VectorTestSetup(avx_platform, 4, Fp(32), SInt(32)),
-                VectorTestSetup(avx_platform, 8, Fp(32), SInt(32)),
-                VectorTestSetup(avx_platform, 2, Fp(64), SInt(64)),
-                VectorTestSetup(avx_platform, 4, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx_platform, 4, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx_platform, 8, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx_platform, 2, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx_platform, 4, Fp(64), SInt(64)),
             ]
 
         case Target.X86_AVX512:
-            avx512_platform = X86VectorCpu(X86VectorArch.AVX512)
+            avx512_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.AVX512)
             return [
-                VectorTestSetup(avx512_platform, 4, Fp(32), SInt(32)),
-                VectorTestSetup(avx512_platform, 8, Fp(32), SInt(32)),
-                VectorTestSetup(avx512_platform, 16, Fp(32), SInt(32)),
-                VectorTestSetup(avx512_platform, 2, Fp(64), SInt(64)),
-                VectorTestSetup(avx512_platform, 4, Fp(64), SInt(64)),
-                VectorTestSetup(avx512_platform, 8, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx512_platform, 4, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 8, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 16, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 2, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx512_platform, 4, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx512_platform, 8, Fp(64), SInt(64)),
             ]
 
         case Target.X86_AVX512_FP16:
-            avx512_platform = X86VectorCpu(X86VectorArch.AVX512_FP16)
+            avx512_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.AVX512_FP16)
             return [
-                VectorTestSetup(avx512_platform, 8, Fp(16), SInt(32)),
-                VectorTestSetup(avx512_platform, 16, Fp(16), SInt(32)),
-                VectorTestSetup(avx512_platform, 32, Fp(16), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 8, Fp(16), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 16, Fp(16), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 32, Fp(16), SInt(32)),
             ]
 
         case _:
@@ -108,6 +98,7 @@ def create_vector_kernel(
     ctx = KernelCreationContext(
         default_dtype=setup.numeric_dtype, index_dtype=setup.index_dtype
     )
+    platform = setup.platform_factory(ctx)
 
     factory = AstFactory(ctx)
 
@@ -129,7 +120,7 @@ def create_vector_kernel(
         loop_nest, lambda l: l.counter.symbol.name == "ctr_0"
     )
 
-    select_intrin = SelectIntrinsics(ctx, setup.platform)
+    select_intrin = SelectIntrinsics(ctx, platform)
     loop_nest = select_intrin(loop_nest)
 
     lower = LowerToC(ctx)
@@ -137,7 +128,7 @@ def create_vector_kernel(
 
     func = create_cpu_kernel_function(
         ctx,
-        setup.platform,
+        platform,
         PsBlock([loop_nest]),
         "vector_kernel",
         Target.CPU,
diff --git a/tests/nbackend/transformations/test_lower_to_c.py b/tests/nbackend/transformations/test_lower_to_c.py
index b557a7493..75e6daf4b 100644
--- a/tests/nbackend/transformations/test_lower_to_c.py
+++ b/tests/nbackend/transformations/test_lower_to_c.py
@@ -51,14 +51,13 @@ def test_lower_buffer_accesses():
     assert isinstance(fasm_lowered.lhs.pointer, PsSymbolExpr)
     assert fasm_lowered.lhs.pointer.symbol == f_buf.base_pointer
 
-    zero = factory.parse_index(0)
     expected_offset = reduce(
         add,
         (
-            (PsExpression.make(dm.counter) + zero) * PsExpression.make(stride)
+            (PsExpression.make(dm.counter)) * PsExpression.make(stride)
             for dm, stride in zip(ispace.dimensions, f_buf.strides)
         ),
-    ) + factory.parse_index(1) * PsExpression.make(f_buf.strides[-1])
+    ) + PsExpression.make(f_buf.strides[-1])
 
     assert fasm_lowered.lhs.offset.structurally_equal(expected_offset)
 
     assert isinstance(fasm_lowered.rhs, PsMemAcc)
diff --git a/tests/test_quicktests.py b/tests/test_quicktests.py
index 506e2bf2c..5d5dba0ea 100644
--- a/tests/test_quicktests.py
+++ b/tests/test_quicktests.py
@@ -9,25 +9,32 @@ def test_basic_kernel():
     dh = ps.create_data_handling(domain_size=domain_shape, periodicity=True)
     assert all(dh.periodicity)
 
-    f = dh.add_array('f', values_per_cell=1)
-    tmp = dh.add_array('tmp', values_per_cell=1)
+    f = dh.add_array("f", values_per_cell=1)
+    tmp = dh.add_array("tmp", values_per_cell=1)
 
     stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)]
-    stencil_3d = [(1, 0, 0), (-1, 0, 0), (0, 1, 0), (0, -1, 0), (0, 0, 1), (0, 0, -1)]
+    stencil_3d = [
+        (1, 0, 0),
+        (-1, 0, 0),
+        (0, 1, 0),
+        (0, -1, 0),
+        (0, 0, 1),
+        (0, 0, -1),
+    ]
     stencil = stencil_2d if dh.dim == 2 else stencil_3d
 
     jacobi = ps.Assignment(tmp.center, sum(f.neighbors(stencil)) / len(stencil))
     kernel = ps.create_kernel(jacobi).compile()
 
     for b in dh.iterate(ghost_layers=1):
-        b['f'].fill(42)
+        b["f"].fill(42)
     dh.run_kernel(kernel)
     for b in dh.iterate(ghost_layers=0):
-        np.testing.assert_equal(b['f'], 42)
+        np.testing.assert_equal(b["f"], 42)
 
     float_seq = [1.0, 2.0, 3.0, 4.0]
     int_seq = [1, 2, 3]
-    for op in ('min', 'max', 'sum'):
+    for op in ("min", "max", "sum"):
         assert (dh.reduce_float_sequence(float_seq, op) == float_seq).all()
         assert (dh.reduce_int_sequence(int_seq, op) == int_seq).all()
 
@@ -37,10 +44,13 @@ def test_basic_blocking_staggered():
     f = ps.fields("f: double[2D]")
     stag = ps.fields("stag(2): double[2D]", field_type=ps.FieldType.STAGGERED)
     terms = [
-       f[0, 0] - f[-1, 0],
-       f[0, 0] - f[0, -1],
+        f[0, 0] - f[-1, 0],
+        f[0, 0] - f[0, -1],
+    ]
+    assignments = [
+        ps.Assignment(stag.staggered_access(d), terms[i])
+        for i, d in enumerate(stag.staggered_stencil)
     ]
-    assignments = [ps.Assignment(stag.staggered_access(d), terms[i]) for i, d in enumerate(stag.staggered_stencil)]
     kernel = ps.create_staggered_kernel(assignments, cpu_blocking=(3, 16)).compile()
     reference_kernel = ps.create_staggered_kernel(assignments).compile()
 
@@ -52,24 +62,27 @@ def test_basic_blocking_staggered():
     np.testing.assert_almost_equal(stag_arr, stag_ref)
 
 
-@pytest.mark.xfail(reason="Vectorization not implemented yet")
 def test_basic_vectorization():
-    supported_instruction_sets = get_supported_instruction_sets()
-    if supported_instruction_sets:
-        instruction_set = supported_instruction_sets[-1]
-    else:
-        instruction_set = None
+    target = ps.Target.auto_cpu()
+    if not target.is_vector_cpu():
+        pytest.skip("No vector CPU available")
 
     f, g = ps.fields("f, g : double[2D]")
-    update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
-    ast = ps.create_kernel(update_rule)
+    update_rule = [
+        ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)
+    ]
+    ast = ps.create_kernel(
+        update_rule,
+        target=target,
+        cpu_optim=ps.CpuOptimConfig(
+            vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
+        ),
+    )
 
-    replace_inner_stride_with_one(ast)
-    vectorize(ast, instruction_set=instruction_set)
     func = ast.compile()
 
     arr = np.ones((23 + 2, 17 + 2)) * 5.0
     dst = np.zeros_like(arr)
 
     func(g=dst, f=arr)
-    np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
\ No newline at end of file
+    np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
-- 
GitLab
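
Usage sketch (an editorial illustration, not part of the patch): the snippet below shows how a kernel is vectorized through the new configuration objects, mirroring test_basic_vectorization and the gen_config fixture above. It assumes that CreateKernelConfig, CpuOptimConfig, VectorizationConfig, and Target are all re-exported from the top-level pystencils namespace, as the updated tests imply.

import numpy as np
import pystencils as ps

# Resolve the most capable CPU target on this machine; this example,
# like test_basic_vectorization, requires an x86 SIMD target.
target = ps.Target.auto_cpu()
assert target.is_vector_cpu(), "example requires a vector CPU target"

f, g = ps.fields("f, g : double[2D]")
update = [ps.Assignment(g[0, 0], f[0, 0] + f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1])]

# Vectorization is now requested via CpuOptimConfig.vectorize;
# assume_inner_stride_one lets DefaultKernelCreationDriver._vectorize
# replace symbolic inner strides of all fields by 1.
cfg = ps.CreateKernelConfig(
    target=target,
    cpu_optim=ps.CpuOptimConfig(
        vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
    ),
)

kernel = ps.create_kernel(update, cfg).compile()

src = np.ones((34, 34))
dst = np.zeros_like(src)
kernel(f=src, g=dst)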
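
The lane count picked by VectorizationConfig.default_lanes is simply the vector register width divided by the element width in bits. The worked figures below follow directly from the match statement in config.py and agree with the lane counts exercised in tests/nbackend/test_vectorization.py:

# X86_SSE    (128-bit registers): float64 -> 128 // (8 * 8) ==  2 lanes
# X86_AVX    (256-bit registers): float64 -> 256 // (8 * 8) ==  4 lanes
#                                 float32 -> 256 // (4 * 8) ==  8 lanes
# X86_AVX512 (512-bit registers): float32 -> 512 // (4 * 8) == 16 lanes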
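
For code still using the deprecated cpu_vectorize_info dict, the compatibility shim added to config.py maps the old keys ("instruction_set", "assume_inner_stride_one", "assume_aligned", "nontemporal") onto the new objects. A sketch of the intended correspondence (the old spelling now warns instead of raising NotImplementedError; exact equivalence is an assumption, not verified here):

# Old style: "instruction_set" overrides `target`; only valid together
# with the default Target.GenericCPU.
old_cfg = ps.CreateKernelConfig(
    cpu_vectorize_info={"instruction_set": "avx", "assume_inner_stride_one": True}
)

# New style: an explicit vector target plus a VectorizationConfig.
new_cfg = ps.CreateKernelConfig(
    target=ps.Target.X86_AVX,
    cpu_optim=ps.CpuOptimConfig(
        vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
    ),
)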