pycodegen / pystencils · Commits · c427c469

Commit c427c469, authored 6 months ago by Richard Angersbach

    Format gpu_indexing.py

Parent: 2624d819
No related branches found
No related tags found
1 merge request: !454 Optimization for GPU block size determination

Pipeline #75759 failed, 6 months ago
Stages: Code Quality, Unit Tests, legacy_test, docs
Showing 1 changed file with 57 additions and 31 deletions: src/pystencils/codegen/gpu_indexing.py (+57 −31)
@@ -66,14 +66,17 @@ class GpuLaunchConfiguration(ABC):

    @staticmethod
    def block_size_exceeds_hw_limits(
        block_size: tuple[int, ...], hw_props: HardwareProperties
    ) -> bool:
        """Checks if provided block size conforms limits given by the hardware."""
        return (
            any(
                size > max_size
                for size, max_size in zip(block_size, hw_props.max_block_sizes)
            )
            or prod(block_size) > hw_props.max_threads_per_block
        )

    @staticmethod
    def _gen_error_msg(block_size: tuple[int, ...]):
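The hunk above only re-wraps the hardware-limit check; the predicate itself is unchanged: a block size is rejected when any dimension exceeds the per-dimension maximum or when the total thread count exceeds the per-block maximum. A minimal standalone sketch of that predicate, assuming typical CUDA-like limits (the HwLimits container and the numbers are illustrative stand-ins, not the pystencils HardwareProperties class):

from dataclasses import dataclass
from math import prod


@dataclass
class HwLimits:
    # illustrative stand-in for the HardwareProperties object used in the diff
    max_threads_per_block: int
    max_block_sizes: tuple[int, int, int]


def exceeds_hw_limits(block_size: tuple[int, ...], hw: HwLimits) -> bool:
    # reject when any dimension is too large or the block holds too many threads overall
    return (
        any(size > max_size for size, max_size in zip(block_size, hw.max_block_sizes))
        or prod(block_size) > hw.max_threads_per_block
    )


limits = HwLimits(max_threads_per_block=1024, max_block_sizes=(1024, 1024, 64))
print(exceeds_hw_limits((32, 8, 1), limits))   # False: 256 threads, every dimension in range
print(exceeds_hw_limits((32, 32, 2), limits))  # True: 2048 threads exceed 1024 per block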
@@ -81,15 +84,15 @@ class GpuLaunchConfiguration(ABC):
            f"Final block size was too large: {block_size}."
        )

    @staticmethod
    def _round_block_sizes_to_warp_size(
        to_round: list[int], warp_size: int
    ) -> tuple[int, ...]:
        # check if already aligns with warp size
        if prod(to_round) % warp_size == 0:
            return tuple(to_round)

        # find index of element closest to warp size and round up
        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
        if index_to_round + 1 < len(to_round):
            return (
                *to_round[:index_to_round],
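The rounding helper shown above leaves a block size untouched when its total thread count is already a multiple of the warp size and otherwise picks the dimension whose remainder modulo the warp size is largest. The tail of the function is collapsed in this diff, so the following sketch completes it under a simple assumption (only the selected dimension is rounded up; the actual code may treat the trailing dimensions differently):

from math import prod


def ceil_to_multiple(value: int, factor: int) -> int:
    # smallest multiple of `factor` that is >= `value`
    return ((value + factor - 1) // factor) * factor


def round_to_warp_size_sketch(to_round: list[int], warp_size: int) -> tuple[int, ...]:
    # already a whole number of warps: keep as-is
    if prod(to_round) % warp_size == 0:
        return tuple(to_round)
    # pick the dimension with the largest remainder modulo the warp size ...
    index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
    # ... and round only that dimension up (assumed completion of the collapsed tail)
    rounded = list(to_round)
    rounded[index_to_round] = ceil_to_multiple(rounded[index_to_round], warp_size)
    return tuple(rounded)


print(round_to_warp_size_sketch([16, 4, 1], 32))  # (16, 4, 1): 64 threads, already aligned
print(round_to_warp_size_sketch([10, 3, 1], 32))  # (32, 3, 1): dim 0 has the largest remainder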
@@ -152,7 +155,9 @@ class GpuLaunchConfiguration(ABC):
        # case 2: trimmed block is equivalent to the whole iteration space
        if all(b == i for b, i in zip(trimmed, it_space)):
            return check_sizes_and_return(
                cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
            )
        else:
            # double block size in each dimension until block is large enough (or case 2 triggers)
            for d in resize_order:
@@ -175,7 +180,11 @@ class GpuLaunchConfiguration(ABC):
                # case 3: trim block is large enough
                if prod(trimmed) >= hw_props.warp_size:
                    return check_sizes_and_return(
                        cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
                    )

        raise CodegenError("Unable to determine GPU block size for this kernel.")
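These two hunks re-wrap the "case 2" and "case 3" exits of the block-size fitting routine: return early when the trimmed block already covers the whole iteration space, or once doubling has made the trimmed block at least one warp large. A much-simplified sketch of that control flow, assuming a warp width of 32 and a plain doubling loop (not the pystencils implementation):

from math import prod

WARP_SIZE = 32  # assumed warp width for the example


def fit_block_size_sketch(
    it_space: tuple[int, ...], initial: tuple[int, ...]
) -> tuple[int, ...]:
    # trim each dimension to the extent of the iteration space
    trimmed = [min(b, i) for b, i in zip(initial, it_space)]

    # case 2: the trimmed block already covers the whole iteration space
    if all(b == i for b, i in zip(trimmed, it_space)):
        return tuple(trimmed)

    # otherwise double dimensions (fastest first) until the block spans a warp
    for d in range(len(trimmed)):
        while trimmed[d] * 2 <= it_space[d] and prod(trimmed) < WARP_SIZE:
            trimmed[d] *= 2
        if prod(trimmed) >= WARP_SIZE:  # case 3: block is large enough
            return tuple(trimmed)

    return tuple(trimmed)


print(fit_block_size_sketch((128, 128, 1), (4, 4, 1)))  # doubles dim 0 once: (8, 4, 1) = 32 threads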
@@ -211,8 +220,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
        # -> round block size in fastest moving dimension up to multiple of warp size
        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
        block_size = (
            (
                ceil_to_multiple(block_size[0], self._hw_props.warp_size)
                if self._assume_warp_aligned_block_size
                else block_size[0]
            ),
            *block_size[1:],
        )
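In AutomaticLaunchConfiguration, each block-size entry is stored as a callable of the kernel's runtime parameters; the hunk above evaluates those callables and then, if `assume_warp_aligned_block_size` is set, rounds the fastest-moving dimension up to a warp multiple. A sketch of that two-step pattern with hypothetical deferred entries (the lambdas, the parameter name `N`, and the inline rounding arithmetic are assumptions for illustration):

from typing import Callable

# hypothetical deferred block size: each entry is a callable of runtime kernel parameters
deferred_block_size: tuple[Callable[..., int], ...] = (
    lambda **kwargs: kwargs["N"],  # fastest dimension follows an assumed field extent N
    lambda **kwargs: 4,
    lambda **kwargs: 1,
)


def evaluate_block_size(warp_size: int, assume_warp_aligned: bool, **kwargs) -> tuple[int, ...]:
    # step 1: evaluate the deferred entries with the runtime parameters
    block_size = tuple(int(bs(**kwargs)) for bs in deferred_block_size)
    # step 2: optionally round the fastest-moving dimension up to a multiple of the warp size
    return (
        (
            ((block_size[0] + warp_size - 1) // warp_size) * warp_size
            if assume_warp_aligned
            else block_size[0]
        ),
        *block_size[1:],
    )


print(evaluate_block_size(32, True, N=20))   # (32, 4, 1)
print(evaluate_block_size(32, False, N=20))  # (20, 4, 1)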
@@ -233,9 +245,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
    """

    def __init__(
        self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False
    ) -> None:
        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -271,9 +281,14 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
        if self._grid_size is None:
            raise AttributeError("No GPU grid size was set by the user.")

        if (
            self._assume_warp_aligned_block_size
            and prod(self._block_size) % self._hw_props.warp_size != 0
        ):
            raise CodegenError(
                "Specified block sizes must align with warp size with "
                "`assume_warp_aligned_block_size` enabled."
            )

        return self._block_size, self._grid_size
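The hunk above only re-wraps the validation in ManualLaunchConfiguration: a user-supplied grid size must exist, and if `assume_warp_aligned_block_size` was promised, the block must contain a whole number of warps. A self-contained sketch of the same checks (a plain ValueError stands in for the project's CodegenError):

from math import prod


def validate_manual_launch(
    block_size: tuple[int, int, int],
    grid_size: tuple[int, int, int] | None,
    warp_size: int,
    assume_warp_aligned_block_size: bool,
):
    # the user must have supplied a grid size before the kernel can be launched
    if grid_size is None:
        raise AttributeError("No GPU grid size was set by the user.")
    # if warp alignment was promised, the block must contain a whole number of warps
    if assume_warp_aligned_block_size and prod(block_size) % warp_size != 0:
        raise ValueError(
            "Specified block sizes must align with warp size with "
            "`assume_warp_aligned_block_size` enabled."
        )
    return block_size, grid_size


print(validate_manual_launch((64, 2, 1), (16, 16, 1), 32, True))  # passes: 128 threads = 4 warps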
@@ -347,17 +362,26 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
        computed_block_size: tuple[int, ...]
        try:
            if self._use_block_size_fitting:
                computed_block_size = self.fit_block_size(
                    num_work_items, self._block_size, self._hw_props
                )
            else:
                computed_block_size = self.trim_block_size(
                    num_work_items, self._block_size, self._hw_props
                )

                # check if assumption for warp size alignment is met
                if (
                    self._assume_warp_aligned_block_size
                    and prod(computed_block_size) % self._hw_props.warp_size != 0
                ):
                    raise CodegenError(
                        "Adapted block size is not divisible by warp size."
                    )
        except CodegenError as e:
            warn(
                f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
                f"Block size fitting could not determine optimal block size configuration. "
                f"Defaulting back to {self._block_size}"
            )
            computed_block_size = self._block_size

        adapted_block_size = cast(dim3, computed_block_size)
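This hunk re-wraps the try/except in DynamicBlockSizeLaunchConfiguration: fitting or trimming may raise a CodegenError, in which case a warning is emitted and the user-configured default block size is used instead. A self-contained sketch of that fallback pattern (the trimming stand-in, the warp width, and the use of ValueError instead of CodegenError are assumptions):

from math import prod
from warnings import warn

WARP_SIZE = 32               # assumed warp width
DEFAULT_BLOCK = (128, 1, 1)  # assumed user-configured default block size


def compute_block_size(num_work_items: tuple[int, ...]) -> tuple[int, ...]:
    try:
        # stand-in for fit_block_size / trim_block_size: shrink to the work-item extent
        computed = tuple(min(b, w) for b, w in zip(DEFAULT_BLOCK, num_work_items))
        if prod(computed) % WARP_SIZE != 0:
            raise ValueError("Adapted block size is not divisible by warp size.")
        return computed
    except ValueError as e:
        # on failure, warn and fall back to the user-configured default block size
        warn(f"Error occurred: {e!r}. Defaulting back to {DEFAULT_BLOCK}")
        return DEFAULT_BLOCK


print(compute_block_size((256, 16, 1)))  # (128, 1, 1): trimmed size stays warp-aligned
print(compute_block_size((20, 1, 1)))    # warns, then falls back to (128, 1, 1)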
@@ -457,8 +481,11 @@ class GpuIndexing:
    def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
        """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
        if self._manual_launch_grid:

            def factory():
                return ManualLaunchConfiguration(
                    self._hw_props, self._assume_warp_aligned_block_size
                )

            return factory
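get_launch_config_factory returns a zero-argument closure so the runtime can build a fresh launch configuration whenever it needs one; the hunk above merely wraps the ManualLaunchConfiguration construction inside that closure. A sketch of the factory pattern with a hypothetical configuration class:

from typing import Callable


class LaunchConfigSketch:
    """Hypothetical stand-in for a launch configuration object."""

    def __init__(self, warp_size: int, assume_warp_aligned_block_size: bool) -> None:
        self.warp_size = warp_size
        self.assume_warp_aligned_block_size = assume_warp_aligned_block_size


def get_launch_config_factory(
    warp_size: int, assume_warp_aligned_block_size: bool
) -> Callable[[], LaunchConfigSketch]:
    # return a zero-argument factory; the runtime calls it later to build a fresh config
    def factory() -> LaunchConfigSketch:
        return LaunchConfigSketch(warp_size, assume_warp_aligned_block_size)

    return factory


make_config = get_launch_config_factory(32, True)
config = make_config()  # each call produces an independent configuration object
print(config.warp_size, config.assume_warp_aligned_block_size)  # 32 True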
@@ -481,8 +508,7 @@ class GpuIndexing:
        )

        work_items_expr += tuple(
            self._ast_factory.parse_index(1) for _ in range(3 - rank)
        )

        num_work_items = cast(
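The hunk above joins the padding expression onto fewer lines: GPU launches are always three-dimensional, so iteration spaces of lower rank are padded with index 1 in the missing dimensions. A plain-Python sketch of the same idea (without the AST factory):

def pad_work_items_to_3d(work_items: tuple[int, ...]) -> tuple[int, ...]:
    # GPU launches are always three-dimensional; pad missing dimensions with extent 1
    rank = len(work_items)
    return tuple(work_items) + tuple(1 for _ in range(3 - rank))


print(pad_work_items_to_3d((1024,)))     # (1024, 1, 1)
print(pad_work_items_to_3d((256, 256)))  # (256, 256, 1)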
@@ -545,7 +571,7 @@ class GpuIndexing:
                block_size,
                cast(_Dim3Lambda, grid_size),
                self._hw_props,
                self._assume_warp_aligned_block_size,
            )

        return factory