diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 051e02e7f59dfc1e17b86e03f2218772b2fa4163..761b11e99af82f1a8ca60673163fa9b511aa5153 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -18,6 +18,7 @@ from .config import ( from .kernel_decorator import kernel, kernel_config from .kernelcreation import create_kernel from .backend.kernelfunction import KernelFunction +from .backend.jit import no_jit from .slicing import make_slice from .spatial_coordinates import ( x_, @@ -51,6 +52,7 @@ __all__ = [ "create_kernel", "KernelFunction", "Target", + "no_jit", "show_code", "to_dot", "get_code_obj", diff --git a/src/pystencils/backend/__init__.py b/src/pystencils/backend/__init__.py index 646fc3055a68c4ca3ff63035b2a72a80591e51a4..a0b1c8f747984e3fffde5a336f40e2aa46ad631d 100644 --- a/src/pystencils/backend/__init__.py +++ b/src/pystencils/backend/__init__.py @@ -5,6 +5,7 @@ from .kernelfunction import ( FieldStrideParam, FieldPointerParam, KernelFunction, + GpuKernelFunction, ) from .constraints import KernelParamsConstraint @@ -16,5 +17,6 @@ __all__ = [ "FieldStrideParam", "FieldPointerParam", "KernelFunction", + "GpuKernelFunction", "KernelParamsConstraint", ] diff --git a/src/pystencils/backend/emission.py b/src/pystencils/backend/emission.py index fdc81a47c2cec1c73fab8042f6cfb61ae2081dc5..9756d21f33550571666f0413ac6c4516b90eb0da 100644 --- a/src/pystencils/backend/emission.py +++ b/src/pystencils/backend/emission.py @@ -1,6 +1,8 @@ from __future__ import annotations from enum import Enum +from ..enums import Target + from .ast.structural import ( PsAstNode, PsBlock, @@ -53,7 +55,7 @@ from .extensions.foreign_ast import PsForeignExpression from .symbols import PsSymbol from ..types import PsScalarType, PsArrayType -from .kernelfunction import KernelFunction +from .kernelfunction import KernelFunction, GpuKernelFunction __all__ = ["emit_code", "CAstPrinter"] @@ -167,10 +169,13 @@ class CAstPrinter: def __call__(self, obj: PsAstNode | KernelFunction) -> str: if isinstance(obj, KernelFunction): + prefix = self._func_prefix(obj) + params_str = ", ".join( f"{p.dtype.c_string()} {p.name}" for p in obj.parameters ) - decl = f"FUNC_PREFIX void {obj.name} ({params_str})" + + decl = " ".join([prefix, "void", obj.name, f"({params_str})"]) body_code = self.visit(obj.body, PrinterCtx()) return f"{decl}\n{body_code}" else: @@ -336,7 +341,7 @@ class CAstPrinter: items_str = ", ".join(self.visit(item, pc) for item in items) pc.pop_op() return "{ " + items_str + " }" - + case PsForeignExpression(children): pc.push_op(Ops.Weakest, LR.Middle) foreign_code = node.get_code(self.visit(c, pc) for c in children) @@ -346,6 +351,12 @@ class CAstPrinter: case _: raise NotImplementedError(f"Don't know how to print {node}") + def _func_prefix(self, func: KernelFunction): + if isinstance(func, GpuKernelFunction) and func.target == Target.CUDA: + return "__global__" + else: + return "FUNC_PREFIX" + def _symbol_decl(self, symb: PsSymbol): dtype = symb.get_dtype() diff --git a/src/pystencils/backend/extensions/foreign_ast.py b/src/pystencils/backend/extensions/foreign_ast.py index 55042ea835e827ebd8f4991cc13af3e5371f89ea..67362ce37e050558d16e06ebc6dbb6b30ff5c6e1 100644 --- a/src/pystencils/backend/extensions/foreign_ast.py +++ b/src/pystencils/backend/extensions/foreign_ast.py @@ -11,7 +11,7 @@ from ...types import PsType class PsForeignExpression(PsExpression, ABC): """Base class for foreign expressions. - + Foreign expressions are expressions whose properties are not modelled by the pystencils AST, and which pystencils therefore does not understand. @@ -24,9 +24,7 @@ class PsForeignExpression(PsExpression, ABC): __match_args__ = ("children",) - def __init__( - self, children: Iterable[PsExpression], dtype: PsType | None = None - ): + def __init__(self, children: Iterable[PsExpression], dtype: PsType | None = None): self._children = list(children) super().__init__(dtype) diff --git a/src/pystencils/backend/kernelcreation/__init__.py b/src/pystencils/backend/kernelcreation/__init__.py index 5de83caadb3b4aa50112ef2b65c28c1ca7932aae..abba9d9d8d571fa7540f82807d009e02d522849f 100644 --- a/src/pystencils/backend/kernelcreation/__init__.py +++ b/src/pystencils/backend/kernelcreation/__init__.py @@ -5,6 +5,7 @@ from .typification import Typifier from .ast_factory import AstFactory from .iteration_space import ( + IterationSpace, FullIterationSpace, SparseIterationSpace, create_full_iteration_space, @@ -19,6 +20,7 @@ __all__ = [ "FreezeExpressions", "Typifier", "AstFactory", + "IterationSpace", "FullIterationSpace", "SparseIterationSpace", "create_full_iteration_space", diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py index 38aca4efef935bce161453359c486c5383eeddc7..c505864508997fd3c02f531f1b653762e448a403 100644 --- a/src/pystencils/backend/kernelcreation/iteration_space.py +++ b/src/pystencils/backend/kernelcreation/iteration_space.py @@ -208,7 +208,7 @@ class FullIterationSpace(IterationSpace): @property def archetype_field(self) -> Field | None: return self._archetype_field - + def dimensions_in_loop_order(self) -> Sequence[FullIterationSpace.Dimension]: """Return the dimensions of this iteration space ordered from the fastest to the slowest coordinate. @@ -220,7 +220,9 @@ class FullIterationSpace(IterationSpace): else: return self._dimensions - def actual_iterations(self, dimension: int | FullIterationSpace.Dimension | None = None) -> PsExpression: + def actual_iterations( + self, dimension: int | FullIterationSpace.Dimension | None = None + ) -> PsExpression: if dimension is None: return reduce( mul, (self.actual_iterations(d) for d in range(len(self.dimensions))) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index eb1576190fad13af01a8de2d352c79138f6b66ba..a389628368eb4f56dfb9de805c8a5cdb109cba27 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,22 +1,22 @@ from pystencils.backend.functions import CFunction, PsMathFunction -from pystencils.types.types import PsType -from .platform import Platform +from pystencils.backend.kernelcreation.context import KernelCreationContext +from pystencils.types import PsType, constify +from ..exceptions import MaterializationError +from .generic_gpu import GenericGpu, GpuThreadsRange -from ..kernelcreation.iteration_space import ( +from ..kernelcreation import ( + Typifier, IterationSpace, FullIterationSpace, - # SparseIterationSpace, + SparseIterationSpace, ) -from ..ast.structural import PsBlock, PsConditional -from ..ast.expressions import ( - PsExpression, - PsLiteralExpr, - PsAdd, -) +from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..ast.expressions import PsExpression, PsLiteralExpr, PsCast from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType from ..literals import PsLiteral +from ...config import GpuIndexingConfig int32 = PsSignedIntegerType(width=32, const=False) @@ -34,7 +34,14 @@ GRID_DIM = [ ] -class CudaPlatform(Platform): +class CudaPlatform(GenericGpu): + + def __init__( + self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None + ) -> None: + super().__init__(ctx) + self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig() + self._typify = Typifier(ctx) @property def required_headers(self) -> set[str]: @@ -42,20 +49,13 @@ class CudaPlatform(Platform): def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> PsBlock: + ) -> tuple[PsBlock, GpuThreadsRange]: if isinstance(ispace, FullIterationSpace): - return self._guard_full_iteration_space(body, ispace) + return self._prepend_dense_translation(body, ispace) + elif isinstance(ispace, SparseIterationSpace): + return self._prepend_sparse_translation(body, ispace) else: - assert False, "unreachable code" - - def cuda_indices(self, dim): - block_size = BLOCK_DIM - indices = [ - block_index * bs + thread_idx - for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX) - ] - - return indices[:dim] + raise MaterializationError(f"Unknown type of iteration space: {ispace}") def select_function( self, math_function: PsMathFunction, dtype: PsType @@ -63,26 +63,57 @@ class CudaPlatform(Platform): raise NotImplementedError() # Internals - def _guard_full_iteration_space( + + def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace - ) -> PsBlock: + ) -> tuple[PsBlock, GpuThreadsRange]: + dimensions = ispace.dimensions_in_loop_order() + launch_config = GpuThreadsRange.from_ispace(ispace) + + indexing_decls = [] + conds = [] + for i, dim in enumerate(dimensions[::-1]): + dim.counter.dtype = constify(dim.counter.get_dtype()) + + ctr = PsExpression.make(dim.counter) + indexing_decls.append( + self._typify( + PsDeclaration( + ctr, + dim.start + + dim.step + * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)), + ) + ) + ) + if not self._cfg.omit_range_check: + conds.append(PsLt(ctr, dim.stop)) + + if conds: + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) + else: + body.statements = indexing_decls + body.statements + ast = body - dimensions = ispace.dimensions + return ast, launch_config - # Determine loop order by permuting dimensions - archetype_field = ispace.archetype_field - if archetype_field is not None: - loop_order = archetype_field.layout - dimensions = [dimensions[coordinate] for coordinate in loop_order] + def _prepend_sparse_translation( + self, body: PsBlock, ispace: SparseIterationSpace + ) -> tuple[PsBlock, GpuThreadsRange]: + ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype()) - start = [ - PsAdd(c, d.start) - for c, d in zip(self.cuda_indices(len(dimensions)), dimensions[::-1]) - ] - conditions = [PsLt(c, d.stop) for c, d in zip(start, dimensions[::-1])] + ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._linear_thread_idx(0) + idx_decl = self._typify(PsDeclaration(ctr, PsCast(ctr.get_dtype(), thread_idx))) + body.statements = [idx_decl] + body.statements - condition: PsExpression = conditions[0] - for c in conditions[1:]: - condition = PsAnd(condition, c) + return body, GpuThreadsRange.from_ispace(ispace) - return PsBlock([PsConditional(condition, body)]) + def _linear_thread_idx(self, coord: int): + block_size = BLOCK_DIM[coord] + block_idx = BLOCK_IDX[coord] + thread_idx = THREAD_IDX[coord] + return block_idx * block_size + thread_idx diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 9ee33cc4e6e132be46df3fd9af6f32de240b2752..0eff7db02ce4f551ca3a68003d0021c393dc38a5 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -89,7 +89,7 @@ class SyclPlatform(GenericGpu): if conds: condition: PsExpression = conds[0] - for cond in conds: + for cond in conds[1:]: condition = PsAnd(condition, cond) ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) else: diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py index 276a0c44f4d1dcfcee330d87ca33a33f88c298bc..23c255ef0949e02ac5b0af57551ceec1bf6cfee2 100644 --- a/src/pystencils/enums.py +++ b/src/pystencils/enums.py @@ -71,13 +71,13 @@ class Target(Flag): found on the current machine and runtime environment. """ - GenericCUDA = _GPU | _CUDA + CUDA = _GPU | _CUDA """Generic CUDA GPU target. Generate a CUDA kernel for a generic Nvidia GPU. """ - GPU = GenericCUDA + GPU = CUDA """Alias for backward compatibility.""" SYCL = _GPU | _SYCL diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py index 62f5d2968fa605244aee4a955fb7f9c2977819eb..f7d33b1deda68fb1715aad79e4d6ca3abe2563ad 100644 --- a/src/pystencils/kernelcreation.py +++ b/src/pystencils/kernelcreation.py @@ -86,16 +86,30 @@ def create_kernel( platform = GenericCpu(ctx) kernel_ast = platform.materialize_iteration_space(kernel_body, ispace) - case Target.SYCL: - from .backend.platforms import SyclPlatform - platform = SyclPlatform(ctx, config.gpu_indexing) + case target if target.is_gpu(): + match target: + case Target.SYCL: + from .backend.platforms import SyclPlatform + + platform = SyclPlatform(ctx, config.gpu_indexing) + case Target.CUDA: + from .backend.platforms import CudaPlatform + + platform = CudaPlatform(ctx, config.gpu_indexing) + case _: + raise NotImplementedError( + f"Code generation for target {target} not implemented" + ) + kernel_ast, gpu_threads = platform.materialize_iteration_space( kernel_body, ispace ) + case _: - # TODO: CUDA/HIP platform - raise NotImplementedError("Target platform not implemented") + raise NotImplementedError( + f"Code generation for target {target} not implemented" + ) # Simplifying transformations elim_constants = EliminateConstants(ctx, extract_constant_exprs=True)