diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 1082669e640c989cad0b994ac70818f22a3517bb..e5035d7c7b4f2f21699f04b3cc0a7245a7ba2ff4 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -72,6 +72,13 @@ These depend on the type of the launch configuration:
 while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel
 parameters,
 the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size.
+The `DynamicBlockSizeLaunchConfiguration` dynamically computes the grid size from either the default block size
+or a computed block size. Computing block sizes can be signaled by the user via the `trim_block_size` or 
+`fit_block_size` member functions. These function receive an initial block size as an argument and adapt it.
+The `trim_block_size` function trims the initial block size with the sizes of the iteration space, i.e. it takes 
+the minimum value of both sizes per dimension. The `fit_block_size` performs a block fitting algorithm that adapts 
+the initial block size by incrementally enlarging the trimmed block size until it is large enough 
+and aligns with the warp size.
 
 The `evaluate` method can only be used from within a Python runtime environment.
 When exporting pystencils CUDA kernels for external use in C++ projects,
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 610c61ddf647331d7b77b06968e489b4dcc76293..48a24e703584e8c87365550c7a7d5d8338feb717 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -111,24 +111,49 @@ For most kernels with an at most three-dimensional iteration space,
 this behavior is sufficient and desired.
 It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`.
 
-If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places.
-The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
-code generator option;
-if none is specified, a default depending on the iteration space's dimensionality will be used.
-
-The block size can furthermore be modified at the compiled kernel's wrapper object via the
-`launch_config.block_size` attribute:
+The GPU thread block size of a compiled kernel's wrapper object can only be directly modified
+for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids).
+Linear indexing schemes without manual launch configurations either employ default block sizes
+or compute block sizes using user-exposed member functions that adapt the initial block size that was
+passed as an argument. The available functions are :meth:`fit_block_size` and :meth:`trim_block_size`.
+The :meth:`trim_block_size` function trims the user-defined initial block size with the iteration space.
+The :meth:`fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is
+divisible by the hardware's warp size.
 
 ```{code-cell} ipython3
+:tags: [raises-exception]
 kfunc = kernel.compile()
-kfunc.launch_config.block_size = (256, 2, 1)
 
-# Run the kernel
+# three different configuration cases for block size:
+
+# a) Nothing
+# -> use default block size, perform no fitting, no trimming
+
+# b) Activate fitting with initial block size
+kfunc.launch_config.fit_block_size((8, 8, 4))
+
+# c) Activate trimming with initial block size
+kfunc.launch_config.trim_block_size((8, 8, 4))
+
+# finally run the kernel...
 kfunc(f=f_arr, g=g_arr)
 ```
 
+Block sizes aligning with multiples of the hardware's warp size allow for a better usage of the GPUs resources.
+Even though such block size configurations are not enforced, notifying our code generator via the 
+`GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size allows 
+for further optimization potential, e.g. for warp-level reductions.
+When setting this option to `True`, the user has to make sure that this alignment applies. 
+For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable 
+block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed 
+by using the default block sizes in pystencils. Using :meth:`fit_block_size` also guarantees this while also producing
+block sizes that are better customized towards the kernel's iteration space. For :meth:`trim_block_size`, 
+the trimmed block's dimension that is closest the next multiple of the warp size is rounded up to the next multiple
+of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error
+is thrown in case these limits are exceeded.
+
 In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
-and the given thread block size.
+and the final thread block size.
 
 :::{attention}
 
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index d630594ba8a633e04e2b058e74f77d604f6d95a9..c97fad413198c16b87d6c27949e70d6aa59a3aa1 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -117,7 +117,7 @@ class Blockwise4DMapping(ThreadMapping):
         THREAD_IDX[0],
         BLOCK_IDX[0],
         BLOCK_IDX[1],
-        BLOCK_IDX[2]
+        BLOCK_IDX[2],
     ]
 
     def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -163,23 +163,19 @@ class Blockwise4DMapping(ThreadMapping):
 
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs.
-    
+
     Args:
         ctx: The kernel creation context
-        omit_range_check: If `True`, generated index translation code will not check if the point identified
-            by block and thread indices is actually contained in the iteration space
         thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
     """
 
     def __init__(
         self,
         ctx: KernelCreationContext,
-        omit_range_check: bool = False,
         thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
-        self._omit_range_check = omit_range_check
         self._thread_mapping = (
             thread_mapping if thread_mapping is not None else Linear3DMapping()
         )
@@ -282,19 +278,12 @@ class CudaPlatform(GenericGpu):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr_expr, dim.stop))
-
-        if conds:
-            condition: PsExpression = conds[0]
-            for cond in conds[1:]:
-                condition = PsAnd(condition, cond)
-            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
-        else:
-            body.statements = indexing_decls + body.statements
-            ast = body
+            conds.append(PsLt(ctr_expr, dim.stop))
 
-        return ast
+        condition: PsExpression = conds[0]
+        for cond in conds[1:]:
+            condition = PsAnd(condition, cond)
+        return PsBlock(indexing_decls + [PsConditional(condition, body)])
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
@@ -324,12 +313,6 @@ class CudaPlatform(GenericGpu):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr_expr.clone(), stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
-
-        return ast
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr_expr.clone(), stop)
+        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index f3c4bb3d5559fa5b418b3f3ca51b3cf43aa99b2f..d16c4f51b0609e50b1047c8f2fbd3824b8630b18 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -35,12 +35,10 @@ class SyclPlatform(Platform):
     def __init__(
         self,
         ctx: KernelCreationContext,
-        omit_range_check: bool = False,
         automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
-        self._omit_range_check = omit_range_check
         self._automatic_block_size = automatic_block_size
 
     @property
@@ -136,8 +134,7 @@ class SyclPlatform(Platform):
             indexing_decls.append(
                 PsDeclaration(ctr, dim.start + work_item_idx * dim.step)
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+            conds.append(PsLt(ctr, dim.stop))
 
         if conds:
             condition: PsExpression = conds[0]
@@ -182,15 +179,9 @@ class SyclPlatform(Platform):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr, stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
-
-        return ast
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr, stop)
+        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 0d43b40e35b9e90b1f13e2ba7bc279274f466859..d6f8e403cdce0c99853ee1127567e11c14b92b32 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -372,30 +372,38 @@ class GpuOptions(ConfigBase):
     indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
     """Thread indexing scheme for dense GPU kernels."""
 
-    omit_range_check: BasicOption[bool] = BasicOption(False)
-    """If set to `True`, omit the iteration counter range check.
+    manual_launch_grid: BasicOption[bool] = BasicOption(False)
+    """Always require a manually specified launch grid when running this kernel.
     
-    By default, the code generator introduces a check if the iteration counters computed from GPU block and thread
-    indices are within the prescribed loop range.
-    This check can be discarded through this option, at your own peril.
+    If set to `True`, the code generator will not attempt to infer the size of
+    the launch grid from the kernel.
+    The launch grid will then have to be specified manually at runtime.
     """
 
-    block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
-    """Desired block size for the execution of GPU kernels.
+    warp_size: BasicOption[int] = BasicOption()
+    """Specifies the size of a warp (CUDA) or wavefront (HIP).
     
-    This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
-    is chosen as an indexing scheme.
-    The block size may be overridden at runtime.
+    If this option is not set the default value for the given target will be automatically used.
     """
 
-    manual_launch_grid: BasicOption[bool] = BasicOption(False)
-    """Always require a manually specified launch grid when running this kernel.
+    assume_warp_aligned_block_size: BasicOption[bool] = BasicOption(False)
+    """Specifies whether block sizes are divisible by the hardware's warp size.
     
-    If set to `True`, the code generator will not attempt to infer the size of
-    the launch grid from the kernel.
-    The launch grid will then have to be specified manually at runtime.
+    If set to `True`, the code generator can employ optimizations that require this assumption, 
+    e.g. warp-level reductions.
+    The pystencils Cupy runtime also checks if user-provided block sizes fulfill this criterion.
     """
 
+    @staticmethod
+    def default_warp_size(target: Target):
+        match target:
+            case Target.CUDA:
+                return 32
+            case _:
+                raise NotImplementedError(
+                    f"No default warp/wavefront size known for target {target}"
+                )
+
     @indexing_scheme.validate
     def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
         if isinstance(val, GpuIndexingScheme):
@@ -601,7 +609,7 @@ class CreateKernelConfig(ConfigBase):
             elif self.get_target() == Target.CUDA:
                 try:
                     from ..jit.gpu_cupy import CupyJit
-                    
+
                     return CupyJit()
 
                 except ImportError:
@@ -734,9 +742,7 @@ class CreateKernelConfig(ConfigBase):
                 UserWarning,
             )
 
-            self.gpu = GpuOptions(
-                block_size=gpu_indexing_params.get("block_size", None)
-            )
+            self.gpu = GpuOptions()
 
 
 def _deprecated_option(name, instead):  # pragma: no cover
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index b8f9c71015765e638ddca22278dfa15c0e5bcaa1..65352f5205125b05acbd088bc42e9205cd7eb415 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -11,6 +11,7 @@ from .config import (
     GhostLayerSpec,
     IterationSliceSpec,
     GpuIndexingScheme,
+    GpuOptions,
 )
 from .kernel import Kernel, GpuKernel
 from .properties import PsSymbolProperty, FieldBasePtr
@@ -401,13 +402,22 @@ class DefaultKernelCreationDriver:
         if self._target != Target.CUDA:
             return None
 
-        from .gpu_indexing import dim3
-
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
-        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
+        assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
+        warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
-        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
+        if warp_size is None:
+            warp_size = GpuOptions.default_warp_size(self._target)
+
+        return GpuIndexing(
+            self._ctx,
+            self._target,
+            idx_scheme,
+            warp_size,
+            manual_launch_grid,
+            assume_warp_aligned_block_size,
+        )
 
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
@@ -437,9 +447,6 @@ class DefaultKernelCreationDriver:
                 )
 
         elif self._target.is_gpu():
-            gpu_opts = self._cfg.gpu
-            omit_range_check: bool = gpu_opts.get_option("omit_range_check")
-
             match self._target:
                 case Target.CUDA:
                     from ..backend.platforms import CudaPlatform
@@ -452,18 +459,15 @@ class DefaultKernelCreationDriver:
 
                     return CudaPlatform(
                         self._ctx,
-                        omit_range_check=omit_range_check,
                         thread_mapping=thread_mapping,
                     )
         elif self._target == Target.SYCL:
             from ..backend.platforms import SyclPlatform
 
             auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size")
-            omit_range_check = self._cfg.gpu.get_option("omit_range_check")
 
             return SyclPlatform(
                 self._ctx,
-                omit_range_check=omit_range_check,
                 automatic_block_size=auto_block_size,
             )
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 27d6fc817d5a9193c3faa4b170d907987fe6022e..d473e5b4a7408d2be77f0c4f2bc3b55a5ca9df0a 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,13 +1,16 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from typing import cast, Any, Callable
 from itertools import chain
+from warnings import warn
 
 from .functions import Lambda
 from .parameters import Parameter
 from .errors import CodegenError
-from .config import GpuIndexingScheme, _AUTO_TYPE
+from .config import GpuIndexingScheme
+from .target import Target
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -16,13 +19,36 @@ from ..backend.kernelcreation import (
 )
 from ..backend.platforms.cuda import ThreadMapping
 
-from ..backend.ast.expressions import PsExpression
+from ..backend.ast.expressions import PsExpression, PsIntDiv
+from math import prod
 
+from ..utils import ceil_to_multiple
 
 dim3 = tuple[int, int, int]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
+@dataclass
+class HardwareProperties:
+    warp_size: int
+    max_threads_per_block: int
+    max_block_sizes: dim3
+
+    def block_size_exceeds_hw_limits(
+            self,
+            block_size: tuple[int, ...]
+    ) -> bool:
+        """Checks if provided block size conforms limits given by the hardware."""
+
+        return (
+            any(
+                size > max_size
+                for size, max_size in zip(block_size, self.max_block_sizes)
+            )
+            or prod(block_size) > self.max_threads_per_block
+        )
+
+
 class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.
 
@@ -33,6 +59,18 @@ class GpuLaunchConfiguration(ABC):
             parameters to the associated kernel
     """
 
+    @property
+    @abstractmethod
+    def block_size(self) -> dim3 | None:
+        """Returns desired block size if available."""
+        pass
+
+    @block_size.setter
+    @abstractmethod
+    def block_size(self, val: dim3):
+        """Sets desired block size if possible."""
+        pass
+
     @property
     @abstractmethod
     def parameters(self) -> frozenset[Parameter]:
@@ -52,6 +90,25 @@ class GpuLaunchConfiguration(ABC):
         this launch configuration, such that when the configuration changes, the JIT parameter
         cache is invalidated."""
 
+    @staticmethod
+    def get_default_block_size(rank: int) -> dim3:
+        """Returns the default block size configuration used by the generator."""
+
+        match rank:
+            case 1:
+                return (256, 1, 1)
+            case 2:
+                return (16, 16, 1)
+            case 3:
+                return (8, 8, 4)
+            case _:
+                assert False, "unreachable code"
+
+    @staticmethod
+    def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
+        return f"Unable to determine GPU block size for this kernel. \
+        Final block size was too large: {block_size}."
+
 
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """Launch configuration that is dynamically computed from kernel parameters.
@@ -63,20 +120,37 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
+        hw_props: HardwareProperties,
+        assume_warp_aligned_block_size: bool,
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
+        self._hw_props = hw_props
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(lb.parameters for lb in chain(block_size, grid_size))
         )
 
+    @property
+    def block_size(self) -> dim3 | None:
+        """Block size is only available when `evaluate` is called."""
+        return None
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
+
     @property
     def parameters(self) -> frozenset[Parameter]:
         return self._params
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+
+        if self._hw_props.block_size_exceeds_hw_limits(block_size):
+            raise CodegenError(f"Block size {block_size} exceeds hardware limits.")
+
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
 
@@ -91,8 +165,12 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self,
+        self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False
     ) -> None:
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        self._hw_props = hw_props
+
         self._block_size: dim3 | None = None
         self._grid_size: dim3 | None = None
 
@@ -123,6 +201,18 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if self._grid_size is None:
             raise AttributeError("No GPU grid size was set by the user.")
 
+        if (
+            self._assume_warp_aligned_block_size
+            and prod(self._block_size) % self._hw_props.warp_size != 0
+        ):
+            raise CodegenError(
+                "Specified block sizes must align with warp size with "
+                "`assume_warp_aligned_block_size` enabled."
+            )
+
+        if self._hw_props.block_size_exceeds_hw_limits(self._block_size):
+            raise CodegenError(self._excessive_block_size_error_msg(self._block_size))
+
         return self._block_size, self._grid_size
 
     def jit_cache_key(self) -> Any:
@@ -130,24 +220,60 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
 
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
+    """GPU launch configuration that dynamically computes the grid size from either the default block size
+    or a computed block size. Computing block sizes can be triggerred via the :meth:`trim_block_size` or
+    :meth:`fit_block_size` member functions. These functions adapt a user-defined initial block size that they
+    receive as an argument. The adaptation of the initial block sizes is described in the following:
 
-    The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
-    in the kernel's iteration space as follows.
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``;
-    - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
+    - if :meth:`fit_block_size` was chosen:
+
+        the initial block size is adapted such that it aligns with multiples of the hardware's warp size.
+        This is done using a fitting algorithm first trims the initial block size with the iteration space
+        and increases it incrementally until it is large enough and coincides with multiples of the warp size, i.e.
+
+        ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)``
+
+        The fitted block size also guarantees the user usage of `GpuOptions.assume_warp_aligned_block_size`.
+
+    - elif :meth:`trim_block_size` was chosen:
+
+        a trimming between the number of work items and the kernel's iteration space occurs, i.e.
+
+        - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c``
+        - otherwise, ``block_size.c = init_block_size.c``
+
+        When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by
+        rounding the block size dimension that is closest the next multiple of the warp size.
+
+    - otherwise: the default block size is taken i.e.
+
+        ``block_size.c = get_default_block_size(rank=3).c``
+
+    The actual launch grid size is then computed as follows.
+
+    ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
     """
 
     def __init__(
         self,
         num_work_items: _Dim3Lambda,
-        default_block_size: dim3 | None = None,
+        hw_props: HardwareProperties,
+        assume_warp_aligned_block_size: bool,
     ) -> None:
         self._num_work_items = num_work_items
 
-        self._block_size: dim3 | None = default_block_size
+        self._hw_props = hw_props
+
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items))
+        self._default_block_size = default_bs
+        self._init_block_size: dim3 = default_bs
+        self._compute_block_size: (
+            Callable[[dim3, dim3, HardwareProperties], tuple[int, ...]] | None
+        ) = None
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(wit.parameters for wit in num_work_items)
@@ -159,44 +285,188 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         dimension from kernel parameters."""
         return self._num_work_items
 
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params
+
     @property
     def block_size(self) -> dim3 | None:
-        """The desired GPU block size."""
-        return self._block_size
+        """Block size is only available when `evaluate` is called."""
+        return None
 
     @block_size.setter
     def block_size(self, val: dim3):
-        self._block_size = val
+        AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.")
+
+    @staticmethod
+    def _round_block_sizes_to_warp_size(
+        to_round: tuple[int, ...], warp_size: int
+    ) -> tuple[int, ...]:
+        # check if already aligns with warp size
+        if prod(to_round) % warp_size == 0:
+            return tuple(to_round)
+
+        # find index of element closest to warp size and round up
+        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
+        if index_to_round + 1 < len(to_round):
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+                *to_round[index_to_round + 1:],
+            )
+        else:
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+            )
 
-    @property
-    def parameters(self) -> frozenset[Parameter]:
-        """Parameters of this launch configuration"""
-        return self._params
+    def trim_block_size(self, block_size: dim3):
+        def call_trimming_factory(
+            it: dim3,
+            bs: dim3,
+            hw: HardwareProperties,
+        ):
+            return self._trim_block_size_to_it_space(it, bs, hw)
 
-    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
-        if self._block_size is None:
-            raise AttributeError("No GPU block size was specified by the user!")
+        self._init_block_size = block_size
+        self._compute_block_size = call_trimming_factory
+
+    def _trim_block_size_to_it_space(
+        self,
+        it_space: dim3,
+        block_size: dim3,
+        hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns specified block sizes trimmed with iteration space.
+        Raises CodegenError if trimmed block size does not conform hardware limits.
+        """
+
+        ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
+        if hw_props.block_size_exceeds_hw_limits(ret):
+            raise CodegenError(self._excessive_block_size_error_msg(ret))
+
+        if (
+            self._assume_warp_aligned_block_size
+            and prod(ret) % self._hw_props.warp_size != 0
+        ):
+            self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
+
+        return ret
+
+    def fit_block_size(self, block_size: dim3):
+        def call_fitting_factory(
+            it: dim3,
+            bs: dim3,
+            hw: HardwareProperties,
+        ):
+            return self._fit_block_size_to_it_space(it, bs, hw)
+
+        self._init_block_size = block_size
+        self._compute_block_size = call_fitting_factory
+
+    def _fit_block_size_to_it_space(
+        self,
+        it_space: dim3,
+        block_size: dim3,
+        hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns an optimized block size configuration with block sizes being aligned with the warp size.
+        Raises CodegenError if optimal block size could not be found or does not conform hardware limits.
+        """
 
+        def trim(to_trim: list[int]) -> list[int]:
+            return [min(b, i) for b, i in zip(to_trim, it_space)]
+
+        def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]:
+            if hw_props.block_size_exceeds_hw_limits(ret):
+                raise CodegenError(self._excessive_block_size_error_msg(ret))
+            return ret
+
+        trimmed = trim(list(block_size))
+        if (
+            prod(trimmed) >= hw_props.warp_size
+            and prod(trimmed) % hw_props.warp_size == 0
+        ):
+            # case 1: greater than min block size -> use trimmed result
+            return check_sizes_and_return(tuple(trimmed))
+
+        prev_trim_size = 0
+        resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
+        while prod(trimmed) is not prev_trim_size:
+            prev_trim_size = prod(trimmed)
+
+            # case 2: trimmed block is equivalent to the whole iteration space
+            if all(b == i for b, i in zip(trimmed, it_space)):
+                return check_sizes_and_return(
+                    self._round_block_sizes_to_warp_size(
+                        tuple(trimmed), hw_props.warp_size
+                    )
+                )
+            else:
+                # double block size in each dimension until block is large enough (or case 2 triggers)
+                for d in resize_order:
+                    trimmed[d] *= 2
+
+                    # trim fastest moving dim to multiples of warp size
+                    if (
+                        d == 0
+                        and trimmed[d] > hw_props.warp_size
+                        and trimmed[d] % hw_props.warp_size != 0
+                    ):
+                        # subtract remainder
+                        trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size)
+
+                    # check if block sizes are within hardware capabilities
+                    trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d])
+
+                    # trim again
+                    trimmed = trim(trimmed)
+
+                    # case 3: trim block is large enough
+                    if prod(trimmed) >= hw_props.warp_size:
+                        return check_sizes_and_return(
+                            self._round_block_sizes_to_warp_size(
+                                tuple(trimmed), hw_props.warp_size
+                            )
+                        )
+
+        raise CodegenError("Unable to determine GPU block size for this kernel.")
+
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
         from ..utils import div_ceil
 
         num_work_items = cast(
             dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
         )
-        reduced_block_size = cast(
-            dim3,
-            tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)),
-        )
+
+        block_size: dim3
+        if self._compute_block_size:
+            try:
+                computed_bs = self._compute_block_size(
+                    num_work_items, self._init_block_size, self._hw_props
+                )
+
+                block_size = cast(dim3, computed_bs)
+            except CodegenError as e:
+                block_size = self._default_block_size
+                warn(
+                    f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
+                    f"Block size fitting could not determine optimal block size configuration. "
+                    f"Defaulting back to {self._default_block_size}."
+                )
+        else:
+            block_size = self._default_block_size
+
         grid_size = cast(
             dim3,
-            tuple(
-                div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size)
-            ),
+            tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)),
         )
 
-        return reduced_block_size, grid_size
+        return block_size, grid_size
 
     def jit_cache_key(self) -> Any:
-        return self._block_size
+        return ()
 
 
 class GpuIndexing:
@@ -218,14 +488,24 @@ class GpuIndexing:
     def __init__(
         self,
         ctx: KernelCreationContext,
+        target: Target,
         scheme: GpuIndexingScheme,
-        default_block_size: dim3 | _AUTO_TYPE | None = None,
+        warp_size: int,
         manual_launch_grid: bool = False,
+        assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
+        self._target = target
         self._scheme = scheme
-        self._default_block_size = default_block_size
+        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        self._hw_props = HardwareProperties(
+            warp_size,
+            self.get_max_threads_per_block(target),
+            self.get_max_block_sizes(target),
+        )
 
         from ..backend.kernelcreation import AstFactory
         from .driver import KernelFactory
@@ -233,6 +513,26 @@ class GpuIndexing:
         self._ast_factory = AstFactory(self._ctx)
         self._kernel_factory = KernelFactory(self._ctx)
 
+    @staticmethod
+    def get_max_block_sizes(target: Target):
+        match target:
+            case Target.CUDA:
+                return (1024, 1024, 64)
+            case _:
+                raise CodegenError(
+                    f"Cannot determine max GPU block sizes for target {target}"
+                )
+
+    @staticmethod
+    def get_max_threads_per_block(target: Target):
+        match target:
+            case Target.CUDA:
+                return 1024
+            case _:
+                raise CodegenError(
+                    f"Cannot determine max GPU threads per block for target {target}"
+                )
+
     def get_thread_mapping(self) -> ThreadMapping:
         """Retrieve a thread mapping object for use by the backend"""
 
@@ -247,7 +547,13 @@ class GpuIndexing:
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
         if self._manual_launch_grid:
-            return ManualLaunchConfiguration
+
+            def factory():
+                return ManualLaunchConfiguration(
+                    self._hw_props, self._assume_warp_aligned_block_size
+                )
+
+            return factory
 
         match self._scheme:
             case GpuIndexingScheme.Linear3D:
@@ -268,10 +574,9 @@ class GpuIndexing:
             )
 
         work_items_expr += tuple(
-            self._ast_factory.parse_index(1)
-            for _ in range(3 - rank)
+            self._ast_factory.parse_index(1) for _ in range(3 - rank)
         )
-        
+
         num_work_items = cast(
             _Dim3Lambda,
             tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr),
@@ -280,28 +585,12 @@ class GpuIndexing:
         def factory():
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
-                self._get_default_block_size(rank),
+                self._hw_props,
+                self._assume_warp_aligned_block_size,
             )
 
         return factory
 
-    def _get_default_block_size(self, rank: int) -> dim3:
-        if self._default_block_size is None:
-            raise CodegenError("The default block size option was not set")
-
-        if isinstance(self._default_block_size, _AUTO_TYPE):
-            match rank:
-                case 1:
-                    return (256, 1, 1)
-                case 2:
-                    return (128, 2, 1)
-                case 3:
-                    return (128, 2, 2)
-                case _:
-                    assert False, "unreachable code"
-        else:
-            return self._default_block_size
-
     def _get_blockwise4d_config_factory(
         self,
     ) -> Callable[[], AutomaticLaunchConfiguration]:
@@ -311,8 +600,19 @@ class GpuIndexing:
         if rank > 4:
             raise ValueError(f"Iteration space rank is too large: {rank}")
 
+        # impossible to use block size determination function since the iteration space is unknown
+        # -> round block size in fastest moving dimension up to multiple of warp size
+        rounded_block_size: PsExpression
+        if self._assume_warp_aligned_block_size:
+            warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
+            rounded_block_size = self._ast_factory.parse_index(
+                PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1),
+                         warp_size.clone()) * warp_size.clone())
+        else:
+            rounded_block_size = work_items[0]
+
         block_size = (
-            self._kernel_factory.create_lambda(work_items[0]),
+            self._kernel_factory.create_lambda(rounded_block_size),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
         )
@@ -328,6 +628,8 @@ class GpuIndexing:
             return AutomaticLaunchConfiguration(
                 block_size,
                 cast(_Dim3Lambda, grid_size),
+                self._hw_props,
+                self._assume_warp_aligned_block_size,
             )
 
         return factory
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index d45abf8786e8fc1789f8eae551a07b6aa556db67..4e36e369e3b65f3f6e8c8f268feae9e16e1b6b56 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Any, Sequence, cast, Callable
+from typing import Any, Callable
 from dataclasses import dataclass
 
 try:
@@ -207,19 +207,9 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
+    def __init__(self):
         self._runtime_headers = {"<cstdint>"}
 
-        if len(default_block_size) > 3:
-            raise ValueError(
-                f"Invalid block size: {default_block_size}. Must be at most three-dimensional."
-            )
-
-        self._default_block_size: tuple[int, int, int] = cast(
-            tuple[int, int, int],
-            tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
-        )
-
     def compile(self, kernel: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py
index 0049d0a2ce36471c3cfb98bf39c965d33c24868b..5ef13f31f891ce7904ac1701272deeb1a337d488 100644
--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -316,3 +316,8 @@ def div_ceil(divident, divisor):
     The result is unspecified if either argument is negative."""
 
     return c_intdiv(divident + divisor - 1, divisor)
+
+
+def ceil_to_multiple(divident, divisor):
+    """Rounds 'divident' to the next multiple of 'divisor'."""
+    return div_ceil(divident, divisor) * divisor
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index f1905b1fcb7c7406f43cfb94af2928b6f35bc3f8..70b4edd35852b1d2ff674e6cb0421bf4fea66e1f 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -12,6 +12,7 @@ from pystencils import (
     create_kernel,
     Target,
 )
+from pystencils.codegen.gpu_indexing import GpuIndexing, HardwareProperties
 
 from pystencils.slicing import (
     add_ghost_layers,
@@ -19,6 +20,7 @@ from pystencils.slicing import (
     remove_ghost_layers,
     normalize_slice,
 )
+from math import prod
 
 try:
     import cupy as cp
@@ -29,10 +31,12 @@ except ImportError:
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("omit_range_check", [False, True])
 @pytest.mark.parametrize("manual_grid", [False, True])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_3d(
-    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+    indexing_scheme: str,
+    manual_grid: bool,
+    assume_warp_aligned_block_size: bool,
 ):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
@@ -47,8 +51,8 @@ def test_indexing_options_3d(
 
     cfg = CreateKernelConfig(target=Target.CUDA)
     cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
@@ -59,14 +63,100 @@ def test_indexing_options_3d(
     if manual_grid:
         match indexing_scheme:
             case "linear3d":
-                kernel.launch_config.block_size = (10, 8, 8)
-                kernel.launch_config.grid_size = (4, 4, 2)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (8, 10, 8)
+                    kernel.launch_config.grid_size = (5, 4, 2)
+                else:
+                    kernel.launch_config.block_size = (10, 10, 8)
+                    kernel.launch_config.grid_size = (4, 4, 2)
             case "blockwise4d":
-                kernel.launch_config.block_size = (40, 1, 1)
-                kernel.launch_config.grid_size = (32, 16, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (64, 1, 1)
+                    kernel.launch_config.grid_size = (32, 16, 1)
+                else:
+                    kernel.launch_config.block_size = (40, 1, 1)
+                    kernel.launch_config.grid_size = (32, 16, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (10, 8, 8)
+        if assume_warp_aligned_block_size:
+            kernel.launch_config.block_size = (32, 8, 2)
+        else:
+            kernel.launch_config.block_size = (10, 10, 10)
+
+    kernel(src=src_arr, dst=dst_arr)
+
+    expected = cp.zeros_like(src_arr)
+    expected[1:-1, 1:-1, 1:-1].fill(6.0)
+
+    cp.testing.assert_allclose(dst_arr, expected)
+
+@pytest.mark.parametrize("iteration_space",
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
+@pytest.mark.parametrize("initial_block_size",
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
+@pytest.mark.parametrize("use_block_fitting", [True, False])
+def test_block_size_adaptations(
+    iteration_space: tuple[int, int, int],
+    initial_block_size: tuple[int, int, int],
+    assume_warp_aligned_block_size: bool,
+    use_block_fitting: bool,
+):
+    src, dst = fields("src, dst: [3D]")
+    asm = Assignment(
+        dst.center(),
+        src[-1, 0, 0]
+        + src[1, 0, 0]
+        + src[0, -1, 0]
+        + src[0, 1, 0]
+        + src[0, 0, -1]
+        + src[0, 0, 1],
+    )
+
+    target = Target.CUDA
+    cfg = CreateKernelConfig(target=target)
+    cfg.gpu.indexing_scheme = "linear3d"
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+    warp_size = cfg.gpu.default_warp_size(target)
+    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
+    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
+    ast = create_kernel(asm, cfg)
+    kernel = ast.compile()
+
+    if use_block_fitting:
+        # test internal block fitting function later used in `kernel.launch_config.fit_block_size`
+        internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
+            iteration_space,
+            initial_block_size,
+            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes),
+        )
+
+        # checks if criterion for warp size alignment is fulfilled
+        def check_suitability(b):
+            return prod(b) >= warp_size and prod(b) % warp_size == 0
+
+        # block size fitting should not modify an already ideal configuration
+        # -> check if ideal configurations are modified
+        if (
+                check_suitability(initial_block_size)
+                and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
+        ):
+            assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \
+                f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
+
+        assert check_suitability(internal_block_size), \
+            "Determined block size shall be divisible by warp size."
+
+        # set block size via fitting algorithm
+        kernel.launch_config.fit_block_size(initial_block_size)
+    else:
+        # set block size via trimming algorithm
+        kernel.launch_config.trim_block_size(initial_block_size)
+
+    src_arr = cp.ones(iteration_space)
+    dst_arr = cp.zeros_like(src_arr)
 
     kernel(src=src_arr, dst=dst_arr)
 
@@ -77,10 +167,10 @@ def test_indexing_options_3d(
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("omit_range_check", [False, True])
 @pytest.mark.parametrize("manual_grid", [False, True])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_2d(
-    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+    indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
     asm = Assignment(
@@ -93,8 +183,8 @@ def test_indexing_options_2d(
 
     cfg = CreateKernelConfig(target=Target.CUDA)
     cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
@@ -105,14 +195,25 @@ def test_indexing_options_2d(
     if manual_grid:
         match indexing_scheme:
             case "linear3d":
-                kernel.launch_config.block_size = (10, 8, 1)
-                kernel.launch_config.grid_size = (4, 2, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (8, 8, 1)
+                    kernel.launch_config.grid_size = (5, 2, 1)
+                else:
+                    kernel.launch_config.block_size = (10, 8, 1)
+                    kernel.launch_config.grid_size = (4, 2, 1)
             case "blockwise4d":
-                kernel.launch_config.block_size = (40, 1, 1)
-                kernel.launch_config.grid_size = (16, 1, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (64, 1, 1)
+                    kernel.launch_config.grid_size = (16, 1, 1)
+                else:
+                    kernel.launch_config.block_size = (40, 1, 1)
+                    kernel.launch_config.grid_size = (16, 1, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (10, 8, 1)
+        if assume_warp_aligned_block_size:
+            kernel.launch_config.block_size = (8, 8, 1)
+        else:
+            kernel.launch_config.block_size = (10, 8, 1)
 
     kernel(src=src_arr, dst=dst_arr)