Commit 7c371fb4 authored by Frederik Hennig

make no assumptions about warp_size for Target.HIP

parent 9e729903
Merge request !458: HIP Target and Platform
Pipeline #76708 canceled
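
The change below threads an optional warp size through the code generation pipeline. For Target.HIP no warp width is assumed, since AMD GPUs execute in wavefronts of either 32 or 64 lanes depending on the architecture; NVIDIA targets keep their fixed 32-lane default. A minimal sketch of the resulting lookup (the real `GpuOptions.default_warp_size` takes a `Target` enum; the body shown here is an assumption, not the shipped code):

    from __future__ import annotations

    def default_warp_size(target: str) -> int | None:
        # Hypothetical stand-in: strings instead of the Target enum.
        if target == "CUDA":
            return 32    # NVIDIA warps are always 32 lanes wide
        return None      # HIP: wavefront width (32 or 64) is device-dependent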
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 
 from .target import Target
 from .config import (
@@ -410,7 +411,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
-        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
 
         return GpuIndexing(
             self._ctx,
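
Requesting warp-aligned block sizes on a target with unknown warp size now degrades to a warning instead of silently assuming a width. A self-contained sketch of the observable behavior (variable names mirror the hunk above; the surrounding driver is elided):

    import warnings

    warp_size = None                        # e.g. Target.HIP with no user override
    assume_warp_aligned_block_size = True

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        if warp_size is None and assume_warp_aligned_block_size:
            warnings.warn(
                "GPU warp size is unknown - ignoring assumption "
                "`assume_warp_aligned_block_size`."
            )
    assert len(caught) == 1                 # the assumption was dropped, with notice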
@@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
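
`HardwareProperties.warp_size` becomes optional. A self-contained sketch of the changed dataclass (the `dim3` alias and the concrete device limits are illustrative assumptions):

    from __future__ import annotations
    from dataclasses import dataclass

    dim3 = tuple[int, int, int]  # assumed alias matching the field annotation

    @dataclass
    class HardwareProperties:
        warp_size: int | None    # None: target gives no warp-width guarantee
        max_threads_per_block: int
        max_block_sizes: dim3

    # A HIP device whose wavefront width is unknown at code generation time:
    hip_props = HardwareProperties(None, 1024, (1024, 1024, 64))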
@@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
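
The manual launch configuration thus only validates the block size against the warp size when one is actually known. A condensed, runnable sketch of that check (`CodegenError` is replaced by `ValueError` to keep the example self-contained):

    from math import prod

    def check_block_size(
        block_size: tuple[int, int, int],
        warp_size: int | None,
        assume_warp_aligned_block_size: bool,
    ) -> None:
        if (
            assume_warp_aligned_block_size
            and warp_size is not None        # unknown warp size: skip the check
            and prod(block_size) % warp_size != 0
        ):
            raise ValueError("Block size is not a multiple of the warp size.")

    check_block_size((16, 16, 1), None, True)  # passes: warp size unknown
    check_block_size((16, 16, 1), 32, True)    # passes: 256 % 32 == 0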
@@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return (
                 *to_round[:index_to_round],
                 ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1 :],
+                *to_round[index_to_round + 1:],
             )
         else:
             return (
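
The tuple splice above rounds a single dimension up to the next warp multiple. A plausible implementation of the `ceil_to_multiple` helper (its actual definition is not part of this diff) together with a worked example:

    def ceil_to_multiple(value: int, factor: int) -> int:
        # Round value up to the nearest multiple of factor.
        return ((value + factor - 1) // factor) * factor

    to_round = (30, 8, 4)   # illustrative block size
    index_to_round = 0      # fastest-moving dimension
    warp_size = 32

    rounded = (
        *to_round[:index_to_round],
        ceil_to_multiple(to_round[index_to_round], warp_size),
        *to_round[index_to_round + 1:],
    )
    assert rounded == (32, 8, 4)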
@@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
@@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
 
         trimmed = trim(list(block_size))
 
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
+
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
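
With no warp size available, the trimmed block size is now returned unchanged; warp-multiple fitting only proceeds with a concrete width. A condensed sketch of that control flow (the function name and the preceding `trim` step are simplified away):

    from math import prod

    def fit_block_size(trimmed: list[int], warp_size: int | None) -> tuple[int, ...]:
        if warp_size is None:
            return tuple(trimmed)  # new early exit: nothing to align against
        if prod(trimmed) >= warp_size and prod(trimmed) % warp_size == 0:
            return tuple(trimmed)  # already a warp multiple
        # otherwise round the fastest-moving dimension up (see the earlier hunk)
        trimmed[0] = ((trimmed[0] + warp_size - 1) // warp_size) * warp_size
        return tuple(trimmed)

    assert fit_block_size([30, 7, 4], None) == (30, 7, 4)
    assert fit_block_size([30, 7, 4], 32) == (32, 7, 4)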
@@ -493,14 +499,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
         manual_launch_grid: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
-        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -608,7 +613,10 @@ class GpuIndexing:
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
-        if self._assume_warp_aligned_block_size:
+        if (
+            self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
+        ):
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
                 PsIntDiv(
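
The `PsIntDiv` expression assembled here presumably encodes the usual integer ceiling-division rounding, evaluated at kernel launch time rather than in Python. In plain arithmetic the same computation reads:

    # Assumption: the truncated PsIntDiv expression computes
    # (bs + warp - 1) / warp * warp in integer arithmetic,
    # i.e. bs rounded up to the next multiple of warp.
    def round_up(bs: int, warp: int) -> int:
        return (bs + warp - 1) // warp * warp

    assert round_up(100, 32) == 128
    assert round_up(128, 32) == 128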
@@ -145,8 +145,9 @@ def test_block_size_adaptations(
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
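
The test now pins the warp size explicitly instead of querying a target-dependent default, which no longer exists for HIP. Users targeting HIP who want warp-aligned block sizes must do the same; a hedged usage sketch following the configuration API visible in the test:

    from pystencils import CreateKernelConfig, Target

    cfg = CreateKernelConfig(target=Target.HIP)
    cfg.gpu.indexing_scheme = "linear3d"
    cfg.gpu.assume_warp_aligned_block_size = True
    cfg.gpu.warp_size = 64  # e.g. AMD CDNA wavefronts; RDNA devices use 32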