diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 7a5074677433830133fef4c2f0f31761b9014284..8c3cd45faf2a210bf8de16e3f25d9ba7897d4e0f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,32 @@
 from __future__ import annotations
 
+import math
+
 from .generic_gpu import GenericGpu
-from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast import PsAstNode
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCall,
+    PsAnd,
+    PsConstantExpr,
+    PsSymbolExpr,
+)
+from ..ast.structural import (
+    PsConditional,
+    PsStatement,
+    PsAssignment,
+    PsBlock,
+    PsStructuralNode,
+)
+from ..constants import PsConstant
 from ..exceptions import MaterializationError
-from ..functions import NumericLimitsFunctions
+from ..functions import NumericLimitsFunctions, CFunction
 from ..literals import PsLiteral
-from ...types import PsType, PsIeeeFloatType
+from ...reduction_op_mapping import reduction_op_to_expr
+from ...sympyextensions import ReductionOp
+from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType
+from ...types.quick import SInt, UInt
 
 
 class CudaPlatform(GenericGpu):
@@ -17,7 +38,92 @@ class CudaPlatform(GenericGpu):
         '"npp.h"',
     }
 
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+        stype = symbol_expr.dtype
+        ptrtype = ptr_expr.dtype
+
+        assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+        assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+        if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+            raise NotImplementedError(
+                "atomic operations are only available for float32/64 datatypes"
+            )
+
+        # workaround for subtractions -> use additions for reducing intermediate results
+        # similar to OpenMP reductions: local copies (negative sign) are added at the end
+        match reduction_op:
+            case ReductionOp.Sub:
+                actual_reduction_op = ReductionOp.Add
+            case _:
+                actual_reduction_op = reduction_op
+
+        # check if thread is valid for performing reduction
+        ispace = self._ctx.get_iteration_space()
+        is_valid_thread = self._get_condition_for_translation(ispace)
+
+        cond: PsExpression
+        shuffles: tuple[PsAssignment, ...]
+        if self._warp_size and self._assume_warp_aligned_block_size:
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(
+                    CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                    [
+                        full_mask,
+                        symbol_expr,
+                        PsConstantExpr(PsConstant(offset, SInt(32))),
+                    ],
+                )
+
+            # set up shuffle instructions for warp-level reduction
+            num_shuffles = math.frexp(self._warp_size)[1]
+            shuffles = tuple(
+                PsAssignment(
+                    symbol_expr,
+                    reduction_op_to_expr(
+                        actual_reduction_op,
+                        symbol_expr,
+                        gen_shuffle_instr(pow(2, i - 1)),
+                    ),
+                )
+                for i in reversed(range(1, num_shuffles))
+            )
+
+            # find first thread in warp
+            first_thread_in_warp = self._first_thread_in_warp(ispace)
+
+            # set condition to only execute atomic operation on first valid thread in warp
+            cond = (
+                PsAnd(is_valid_thread, first_thread_in_warp)
+                if is_valid_thread
+                else first_thread_in_warp
+            )
+        else:
+            # no optimization: only execute atomic operation on valid thread
+            shuffles = ()
+            cond = is_valid_thread
+
+        # use atomic operation
+        func = CFunction(
+            f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void")
+        )
+        func_args = (ptr_expr, symbol_expr)
+
+        # assemble warp reduction
+        return shuffles, PsConditional(
+            cond, PsBlock([PsStatement(PsCall(func, func_args))])
+        )
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         match func:
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 349e79d4b6a727e84d9f2ec6e941c231ad3fcb81..8a4dd11a29c3243bc1cbad2a857646a1a7be038e 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,16 +1,19 @@
 from __future__ import annotations
 
-import math
 import operator
 from abc import ABC, abstractmethod
 from functools import reduce
 
 from ..ast import PsAstNode
 from ..constants import PsConstant
-from ...reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions.reduction import ReductionOp
-from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType
-from ...types.quick import UInt, SInt
+from ...types import (
+    constify,
+    deconstify,
+    PsScalarType,
+    PsType,
+)
+from ...types.quick import SInt
 from ..exceptions import MaterializationError
 from .platform import Platform
 
@@ -28,8 +31,6 @@ from ..ast.structural import (
     PsBlock,
     PsConditional,
     PsDeclaration,
-    PsStatement,
-    PsAssignment,
     PsStructuralNode,
 )
 from ..ast.expressions import (
@@ -39,7 +40,6 @@ from ..ast.expressions import (
     PsCall,
     PsLookup,
     PsBufferAcc,
-    PsSymbolExpr,
     PsConstantExpr,
     PsAdd,
     PsRem,
@@ -206,7 +206,18 @@ class GenericGpu(Platform):
     }
 
     @abstractmethod
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        pass
+
+    @abstractmethod
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         pass
 
     def __init__(
@@ -262,6 +273,31 @@ class GenericGpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
+    @staticmethod
+    def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
+        """Returns thread indices multiplied by block dimension strides, per dimension."""
+
+        return tuple(
+            idx
+            * PsConstantExpr(
+                PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
+            )
+            for i, idx in enumerate(THREAD_IDX[: ispace.rank])
+        )
+
+    def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression:
+        """Returns an expression that determines whether a thread is the first within its warp."""
+
+        tids_per_dim = GenericGpu._thread_index_per_dim(ispace)
+        tid: PsExpression = tids_per_dim[0]
+        for t in tids_per_dim[1:]:
+            tid = PsAdd(tid, t)
+
+        return PsEq(
+            PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
+            PsConstantExpr(PsConstant(0, SInt(32))),
+        )
+
     def select_function(
         self, call: PsCall
     ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
@@ -276,97 +312,8 @@ class GenericGpu(Platform):
         ):
             ptr_expr, symbol_expr = call.args
             op = call_func.reduction_op
-            stype = symbol_expr.dtype
-            ptrtype = ptr_expr.dtype
-
-            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(
-                ptrtype, PsPointerType
-            )
-            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(
-                stype, PsScalarType
-            )
-
-            if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
-                NotImplementedError(
-                    "atomic operations are only available for float32/64 datatypes"
-                )
-
-            # workaround for subtractions -> use additions for reducing intermediate results
-            # similar to OpenMP reductions: local copies (negative sign) are added at the end
-            match op:
-                case ReductionOp.Sub:
-                    actual_op = ReductionOp.Add
-                case _:
-                    actual_op = op
-
-            # check if thread is valid for performing reduction
-            ispace = self._ctx.get_iteration_space()
-            is_valid_thread = self._get_condition_for_translation(ispace)
-
-            cond: PsExpression
-            shuffles: tuple[PsAssignment, ...]
-            if self._warp_size and self._assume_warp_aligned_block_size:
-                # perform local warp reductions
-                def gen_shuffle_instr(offset: int):
-                    full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                    return PsCall(
-                        CFunction(
-                            "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype
-                        ),
-                        [
-                            full_mask,
-                            symbol_expr,
-                            PsConstantExpr(PsConstant(offset, SInt(32))),
-                        ],
-                    )
-
-                # set up shuffle instructions for warp-level reduction
-                num_shuffles = math.frexp(self._warp_size)[1]
-                shuffles = tuple(
-                    PsAssignment(
-                        symbol_expr,
-                        reduction_op_to_expr(
-                            actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))
-                        ),
-                    )
-                    for i in reversed(range(1, num_shuffles))
-                )
-
-                # find first thread in warp
-                thread_indices_per_dim = [
-                    idx
-                    * PsConstantExpr(
-                        PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
-                    )
-                    for i, idx in enumerate(THREAD_IDX[: ispace.rank])
-                ]
-                tid: PsExpression = thread_indices_per_dim[0]
-                for t in thread_indices_per_dim[1:]:
-                    tid = PsAdd(tid, t)
-                first_thread_in_warp = PsEq(
-                    PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
-                    PsConstantExpr(PsConstant(0, SInt(32))),
-                )
-
-                # set condition to only execute atomic operation on first valid thread in warp
-                cond = (
-                    PsAnd(is_valid_thread, first_thread_in_warp)
-                    if is_valid_thread
-                    else first_thread_in_warp
-                )
-            else:
-                # no optimization: only execute atomic add on valid thread
-                shuffles = ()
-                cond = is_valid_thread
-
-            # use atomic operation
-            call.function = CFunction(
-                f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")
-            )
-            call.args = (ptr_expr, symbol_expr)
-
-            # assemble warp reduction
-            return shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))
+
+            return self.resolve_reduction(ptr_expr, symbol_expr, op)
 
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index 45d60452bf9b560d1c55230f7c11906026db164b..404d9bb27bf7eded15b9e2f7dc60fb70ceedb1fe 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -1,9 +1,13 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast import PsAstNode
 from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast.structural import PsStructuralNode
+from ..exceptions import MaterializationError
 from ..functions import NumericLimitsFunctions
 from ..literals import PsLiteral
+from ...sympyextensions import ReductionOp
 from ...types import PsType, PsIeeeFloatType
 
 
@@ -12,12 +16,11 @@ class HipPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return super().required_headers | {
-            '"pystencils_runtime/hip.h"',
-            "<limits>"
-        }
+        return super().required_headers | {'"pystencils_runtime/hip.h"', "<limits>"}
 
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         return PsLiteralExpr(
@@ -26,3 +29,12 @@
                 dtype,
             )
         )
+
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+
+        raise MaterializationError("Reductions are not yet supported in the HIP backend.")
diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h
index cd4bd5730657930f7a2375f74df0f5691ebaaf15..bd1889153a56876032781b77dcb66a413b3fe07d 100644
--- a/src/pystencils/include/simd_horizontal_helpers.h
+++ b/src/pystencils/include/simd_horizontal_helpers.h
@@ -120,7 +120,7 @@ inline float _mm256_horizontal_max_ps(float dst, __m256 src) {
 
 #endif
 
-#if defined(__AVX512VL__)
+#if defined(__AVX512F__)
 #include <immintrin.h>
 
 inline double _mm512_horizontal_add_pd(double dst, __m512d src) {
diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py
index aebbf35bb65dd9fd66374a98a038ed3c4a3151db..1d652c6e1f53f5a7b060f2a725162ef07f1fdfae 100644
--- a/util/generate_simd_horizontal_op.py
+++ b/util/generate_simd_horizontal_op.py
@@ -277,7 +277,7 @@ vtypes_for_instruction_set = {
 guards_for_instruction_sets = {
     InstructionSets.SSE3: "__SSE3__",
     InstructionSets.AVX: "__AVX__",
-    InstructionSets.AVX512: '__AVX512VL__',
+    InstructionSets.AVX512: '__AVX512F__',
     InstructionSets.NEON: '_M_ARM64',
 }
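For orientation, below is a hand-written CUDA sketch of the pattern that resolve_reduction is intended to emit: a warp-level XOR-shuffle reduction followed by a single atomic per warp. It is not part of the patch and not generated output; it assumes a warp size of 32, a warp-aligned block size (the _assume_warp_aligned_block_size fast path) and a float sum reduction, and the identifiers reduce_sum, in, result and n are illustrative only.

// Reviewer sketch only -- approximates the construct the backend is meant to generate.
#include <cuda_runtime.h>

__global__ void reduce_sum(const float *in, float *result, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // "is_valid_thread": out-of-range threads contribute the identity element
    float local = (tid < n) ? in[tid] : 0.0f;

    // Butterfly/XOR shuffle with offsets 16, 8, 4, 2, 1 (cf. gen_shuffle_instr);
    // afterwards every lane of the warp holds the full warp sum.
    for (int offset = 16; offset > 0; offset >>= 1)
        local += __shfl_xor_sync(0xffffffffu, local, offset);

    // Only the first lane of each warp performs the atomic write-back
    // (cf. _first_thread_in_warp), so one atomic is issued per warp.
    if ((threadIdx.x % 32) == 0)
        atomicAdd(result, local);
}

Reducing inside the warp first means each warp issues one atomic instead of 32, which is the motivation for guarding the atomic with the first-thread-in-warp condition rather than with the validity condition alone.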