Commit c427c469 authored by Richard Angersbach

Format gpu_indexing.py

parent 2624d819
1 merge request: !454 Optimization for GPU block size determination
Pipeline #75759 failed
@@ -66,14 +66,17 @@ class GpuLaunchConfiguration(ABC):
     @staticmethod
     def block_size_exceeds_hw_limits(
-        block_size: tuple[int, ...],
-        hw_props: HardwareProperties) -> bool:
+        block_size: tuple[int, ...], hw_props: HardwareProperties
+    ) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""
-        return any(
+        return (
+            any(
                 size > max_size
                 for size, max_size in zip(block_size, hw_props.max_block_sizes)
-        ) or prod(block_size) > hw_props.max_threads_per_block
+            )
+            or prod(block_size) > hw_props.max_threads_per_block
+        )
 
     @staticmethod
     def _gen_error_msg(block_size: tuple[int, ...]):
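Note: the reformatting does not change the predicate. A block size is rejected if any single dimension exceeds its per-dimension hardware limit, or if the total thread count exceeds the per-block limit. A minimal standalone sketch of that check (the limit values below are illustrative and not read from HardwareProperties):

    from math import prod

    # illustrative limits, typical for many CUDA devices (not taken from HardwareProperties)
    max_block_sizes = (1024, 1024, 64)
    max_threads_per_block = 1024

    def exceeds_limits(block_size: tuple[int, ...]) -> bool:
        # same two-part condition as block_size_exceeds_hw_limits
        return (
            any(size > max_size for size, max_size in zip(block_size, max_block_sizes))
            or prod(block_size) > max_threads_per_block
        )

    print(exceeds_limits((32, 8, 1)))    # False: 256 threads, all dims within limits
    print(exceeds_limits((32, 32, 2)))   # True: 2048 threads exceed the per-block limit
    print(exceeds_limits((2048, 1, 1)))  # True: first dimension exceeds its per-dim limit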
@@ -81,15 +84,15 @@ class GpuLaunchConfiguration(ABC):
             Final block size was too large: {block_size}."
 
     @staticmethod
-    def _round_block_sizes_to_warp_size(to_round: list[int], warp_size: int) -> tuple[int, ...]:
+    def _round_block_sizes_to_warp_size(
+        to_round: list[int], warp_size: int
+    ) -> tuple[int, ...]:
         # check if already aligns with warp size
         if prod(to_round) % warp_size == 0:
             return tuple(to_round)
 
         # find index of element closest to warp size and round up
-        index_to_round = to_round.index(
-            max(to_round, key=lambda i: abs(i % warp_size))
-        )
+        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
         if index_to_round + 1 < len(to_round):
             return (
                 *to_round[:index_to_round],
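Note: the condensed `index_to_round` expression still picks the dimension whose size has the largest remainder modulo the warp size; per the preceding comment, that dimension is then rounded up. A small illustration of the selection step, assuming a warp size of 32 and made-up block values:

    from math import prod

    warp_size = 32
    to_round = [24, 10, 1]  # prod == 240, and 240 % 32 == 16, so rounding is needed

    if prod(to_round) % warp_size != 0:
        # remainders mod 32 are 24, 10 and 1 -> the first dimension is selected
        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
        print(index_to_round)  # 0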
@@ -152,7 +155,9 @@ class GpuLaunchConfiguration(ABC):
             # case 2: trimmed block is equivalent to the whole iteration space
             if all(b == i for b, i in zip(trimmed, it_space)):
-                return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size))
+                return check_sizes_and_return(
+                    cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
+                )
             else:
                 # double block size in each dimension until block is large enough (or case 2 triggers)
                 for d in resize_order:
@@ -175,7 +180,11 @@ class GpuLaunchConfiguration(ABC):
                 # case 3: trim block is large enough
                 if prod(trimmed) >= hw_props.warp_size:
-                    return check_sizes_and_return(cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size))
+                    return check_sizes_and_return(
+                        cls._round_block_sizes_to_warp_size(
+                            trimmed, hw_props.warp_size
+                        )
+                    )
 
         raise CodegenError("Unable to determine GPU block size for this kernel.")
@@ -211,8 +220,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         # -> round block size in fastest moving dimension up to multiple of warp size
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
         block_size = (
+            (
                 ceil_to_multiple(block_size[0], self._hw_props.warp_size)
-            if self._assume_warp_aligned_block_size else block_size[0],
+                if self._assume_warp_aligned_block_size
+                else block_size[0]
+            ),
             *block_size[1:],
         )
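Note: the extra parentheses only group the conditional expression; when `assume_warp_aligned_block_size` is set, the first (fastest moving) block dimension is still rounded up to the next warp-size multiple. A minimal sketch with a hand-rolled stand-in for `ceil_to_multiple` (assumed to round up to the next multiple, as the name and comment suggest):

    def ceil_to_multiple(value: int, multiple: int) -> int:
        # stand-in with the semantics the name implies: round up to the next multiple
        return ((value + multiple - 1) // multiple) * multiple

    warp_size = 32
    assume_warp_aligned_block_size = True

    block_size = (17, 4, 4)  # illustrative evaluated block size
    block_size = (
        (
            ceil_to_multiple(block_size[0], warp_size)
            if assume_warp_aligned_block_size
            else block_size[0]
        ),
        *block_size[1:],
    )
    print(block_size)  # (32, 4, 4)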
@@ -233,9 +245,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self,
-        hw_props: HardwareProperties,
-        assume_warp_aligned_block_size: bool = False
+        self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False
     ) -> None:
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -271,9 +281,14 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if self._grid_size is None:
             raise AttributeError("No GPU grid size was set by the user.")
 
-        if self._assume_warp_aligned_block_size and prod(self._block_size) % self._hw_props.warp_size != 0:
-            raise CodegenError("Specified block sizes must align with warp size with "
-                               "`assume_warp_aligned_block_size` enabled.")
+        if (
+            self._assume_warp_aligned_block_size
+            and prod(self._block_size) % self._hw_props.warp_size != 0
+        ):
+            raise CodegenError(
+                "Specified block sizes must align with warp size with "
+                "`assume_warp_aligned_block_size` enabled."
+            )
 
         return self._block_size, self._grid_size
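Note: the reformatted guard enforces the same rule: with `assume_warp_aligned_block_size` enabled, the manually specified block sizes must multiply to a multiple of the warp size. For example, assuming a warp size of 32:

    from math import prod

    warp_size = 32

    for block_size in [(48, 2, 1), (50, 2, 1)]:
        aligned = prod(block_size) % warp_size == 0
        # (48, 2, 1) -> 96 threads, 96 % 32 == 0: accepted
        # (50, 2, 1) -> 100 threads, 100 % 32 == 4: would raise CodegenError
        print(block_size, "ok" if aligned else "rejected")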
@@ -347,17 +362,26 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         computed_block_size: tuple[int, ...]
         try:
             if self._use_block_size_fitting:
-                computed_block_size = self.fit_block_size(num_work_items, self._block_size, self._hw_props)
+                computed_block_size = self.fit_block_size(
+                    num_work_items, self._block_size, self._hw_props
+                )
             else:
-                computed_block_size = self.trim_block_size(num_work_items, self._block_size, self._hw_props)
+                computed_block_size = self.trim_block_size(
+                    num_work_items, self._block_size, self._hw_props
+                )
 
             # check if assumption for warp size alignment is met
-            if self._assume_warp_aligned_block_size and prod(computed_block_size) % self._hw_props.warp_size != 0:
+            if (
+                self._assume_warp_aligned_block_size
+                and prod(computed_block_size) % self._hw_props.warp_size != 0
+            ):
                 raise CodegenError("Adapted block size is not divisible by warp size.")
         except CodegenError as e:
-            warn(f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
+            warn(
+                f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
                 f"Block size fitting could not determine optimal block size configuration. "
-                 f"Defaulting back to {self._block_size}")
+                f"Defaulting back to {self._block_size}"
+            )
             computed_block_size = self._block_size
 
         adapted_block_size = cast(dim3, computed_block_size)
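Note: the control flow is unchanged: compute a block size via `fit_block_size` or `trim_block_size`, re-check warp alignment, and on a CodegenError warn and fall back to the user-provided default. A condensed, self-contained sketch of that fallback pattern (names and bodies below are placeholders, not the class's actual API):

    from math import prod
    from warnings import warn

    class CodegenError(Exception):
        pass

    def choose_block_size(default, warp_size, assume_aligned, compute):
        """Placeholder mirroring the try/except fallback in the diff."""
        try:
            computed = compute()  # stands in for fit_block_size / trim_block_size
            if assume_aligned and prod(computed) % warp_size != 0:
                raise CodegenError("Adapted block size is not divisible by warp size.")
            return computed
        except CodegenError as e:
            warn(f"Falling back to default block size {default}: {e}")
            return default

    # a fitting step that yields a misaligned block triggers the fallback
    print(choose_block_size((128, 1, 1), 32, True, lambda: (10, 7, 1)))  # (128, 1, 1)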
@@ -457,8 +481,11 @@ class GpuIndexing:
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
         if self._manual_launch_grid:
 
             def factory():
-                return ManualLaunchConfiguration(self._hw_props, self._assume_warp_aligned_block_size)
+                return ManualLaunchConfiguration(
+                    self._hw_props, self._assume_warp_aligned_block_size
+                )
 
             return factory
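Note: `get_launch_config_factory` still returns a zero-argument callable, so the runtime system can defer construction of the launch configuration until it is needed. A self-contained sketch of the same closure-based factory pattern, with a placeholder class standing in for `GpuLaunchConfiguration`:

    from typing import Callable

    class LaunchConfig:
        """Placeholder standing in for GpuLaunchConfiguration."""
        def __init__(self, warp_aligned: bool) -> None:
            self.warp_aligned = warp_aligned

    def get_launch_config_factory(warp_aligned: bool) -> Callable[[], LaunchConfig]:
        # same closure pattern as GpuIndexing.get_launch_config_factory: capture the
        # relevant settings now, defer construction until the runtime asks for it
        def factory() -> LaunchConfig:
            return LaunchConfig(warp_aligned)

        return factory

    factory = get_launch_config_factory(warp_aligned=True)
    config = factory()  # a fresh configuration per call
    print(config.warp_aligned)  # True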
@@ -481,8 +508,7 @@ class GpuIndexing:
         )
 
         work_items_expr += tuple(
-            self._ast_factory.parse_index(1)
-            for _ in range(3 - rank)
+            self._ast_factory.parse_index(1) for _ in range(3 - rank)
         )
 
         num_work_items = cast(
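Note: this hunk only joins the generator expression onto one line; the expression still pads the work-item tuple with ones up to the three launch dimensions. A plain-Python illustration of the padding, with `parse_index(1)` replaced by a literal 1:

    rank = 1
    work_items = (256,)
    # pad to the three launch dimensions, as in the diff
    work_items = work_items + tuple(1 for _ in range(3 - rank))
    print(work_items)  # (256, 1, 1)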
@@ -545,7 +571,7 @@ class GpuIndexing:
                 block_size,
                 cast(_Dim3Lambda, grid_size),
                 self._hw_props,
-                self._assume_warp_aligned_block_size
+                self._assume_warp_aligned_block_size,
             )
 
         return factory