Commit 7c371fb4 authored by Frederik Hennig

make no assumptions about warp_size for Target.HIP

parent 9e729903
1 merge request: !458 HIP Target and Platform
Pipeline #76708 canceled
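
In practice this means the generator no longer falls back to a hard-coded warp size for Target.HIP: if no warp size is given, warp-size-dependent behaviour such as `assume_warp_aligned_block_size` is skipped with a warning instead of being silently applied. A hedged usage sketch, not part of this commit; the package-level imports and the field/assignment setup are assumptions for illustration only:

# Hedged sketch, not part of this commit. Assumes pystencils exposes fields,
# Assignment, CreateKernelConfig, Target and create_kernel at package level.
import pystencils as ps

src, dst = ps.fields("src, dst: float64[3D]")
update = [ps.Assignment(dst[0, 0, 0], 2 * src[0, 0, 0])]

cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
cfg.gpu.indexing_scheme = "linear3d"
cfg.gpu.assume_warp_aligned_block_size = True

# Without an explicit warp size, HIP no longer assumes one: kernel creation now
# warns "GPU warp size is unknown - ignoring assumption
# `assume_warp_aligned_block_size`." and drops the alignment guarantee.
kernel = ps.create_kernel(update, cfg)

# Supplying the warp size explicitly (e.g. 64 on CDNA-class AMD GPUs) keeps the
# warp-aligned block sizes.
cfg.gpu.warp_size = 64
kernel_aligned = ps.create_kernel(update, cfg)
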
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 from .target import Target
 from .config import (
@@ -410,7 +411,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
-        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
         return GpuIndexing(
             self._ctx,
......
@@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
@@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
@@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return (
                 *to_round[:index_to_round],
                 ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1 :],
+                *to_round[index_to_round + 1:],
             )
         else:
             return (
@@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
@@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
         trimmed = trim(list(block_size))
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
@@ -493,14 +499,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
        manual_launch_grid: bool = False,
        assume_warp_aligned_block_size: bool = False,
    ) -> None:
        self._ctx = ctx
        self._target = target
        self._scheme = scheme
-        self._warp_size = warp_size
        self._manual_launch_grid = manual_launch_grid
        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -608,7 +613,10 @@ class GpuIndexing:
         # impossible to use block size determination function since the iteration space is unknown
         # -> round block size in fastest moving dimension up to multiple of warp size
         rounded_block_size: PsExpression
-        if self._assume_warp_aligned_block_size:
+        if (
+            self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
+        ):
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
                 PsIntDiv(
......
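
The rounding performed by `ceil_to_multiple` and by the `PsIntDiv` expression above can be illustrated with a small self-contained sketch (not from this commit; the helper is re-implemented locally and the numbers are only examples): the selected block-size component is rounded up to the next multiple of the warp size, which makes the total thread count per block warp-aligned.

# Illustrative re-implementation of the rounding used above; the real helper
# lives elsewhere in pystencils.
def ceil_to_multiple(value: int, factor: int) -> int:
    # round `value` up to the next multiple of `factor` via integer ceiling division
    return ((value + factor - 1) // factor) * factor

warp_size = 64             # e.g. CDNA-class AMD GPUs; 32 on current NVIDIA GPUs
block_size = (37, 4, 2)    # fastest-moving dimension first

# Round only the fastest-moving component, as the code above does.
aligned = (ceil_to_multiple(block_size[0], warp_size), *block_size[1:])

assert aligned == (64, 4, 2)
assert (aligned[0] * aligned[1] * aligned[2]) % warp_size == 0
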
@@ -145,8 +145,9 @@ def test_block_size_adaptations(
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
......