Skip to content
Snippets Groups Projects
Commit d9c8f260 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

fix testsuite and docs

parent 26eb2d62
No related branches found
No related tags found
1 merge request !449: GPU Indexing Schemes and Launch Configurations
Pipeline #74069 failed
......@@ -108,6 +108,7 @@ The following categories with target-specific options are exposed:
VectorizationOptions
GpuOptions
SyclOptions
GpuIndexingScheme
.. autosummary::
:toctree: generated
......@@ -176,5 +177,4 @@ The following categories with target-specific options are exposed:
Kernel
GpuKernel
Parameter
GpuThreadsRange
```
......@@ -51,12 +51,6 @@ ps.inspect(kernel)
The `kernel` object returned by the code generator in above snippet is an instance
of the {py:class}`GpuKernel` class.
It extends {py:class}`Kernel` with some GPU-specific information.
In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
property, which tells us how many threads the kernel is expecting to be executed with:
```{code-cell} ipython3
kernel.threads_range
```
If a GPU is available and [CuPy][cupy] is installed in the current environment,
the kernel can be compiled and run immediately.
......@@ -87,7 +81,7 @@ kfunc = kernel.compile()
kfunc(f=f_arr, g=g_arr)
```
### Modifying the Launch Grid
### Modifying the Launch Configuration
The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
This object holds the kernel's launch grid configuration
......@@ -150,26 +144,26 @@ assignments = [
```{code-cell} ipython3
y = ps.DEFAULTS.spatial_counters[0]
cfg = ps.CreateKernelConfig(
target=ps.Target.CUDA,
iteration_slice=ps.make_slice[:, y:]
)
kernel = ps.create_kernel(assignments, cfg).compile()
cfg = ps.CreateKernelConfig()
cfg.target= ps.Target.CUDA
cfg.iteration_slice = ps.make_slice[:, y:]
```
This warns us that the threads range could not be determined automatically.
We can disable this warning by setting `manual_launch_grid` in the GPU option category:
In this case, it is necessary to set the `gpu.manual_launch_grid` option to `True`;
otherwise, code generation will fail as the code generator cannot figure out
a GPU grid size on its own:
```{code-cell}
```{code-cell} ipython3
cfg.gpu.manual_launch_grid = True
kernel = ps.create_kernel(assignments, cfg).compile()
```
Now, to execute our kernel, we have to manually specify its launch grid:
```{code-cell} ipython3
kernel.block_size = (8, 8)
kernel.num_blocks = (2, 2)
kernel.launch_config.block_size = (8, 8)
kernel.launch_config.grid_size = (2, 2)
```
This way the kernel will cover this iteration space:
......@@ -184,8 +178,8 @@ _draw_ispace(cp.asnumpy(f_arr))
We can also observe the effect of decreasing the launch grid size:
```{code-cell} ipython3
kernel.block_size = (4, 4)
kernel.num_blocks = (2, 3)
kernel.launch_config.block_size = (4, 4)
kernel.launch_config.grid_size = (2, 3)
```
```{code-cell} ipython3
......@@ -199,15 +193,6 @@ Here, since there are only eight threads operating in $x$-direction,
and twelve threads in $y$-direction,
only a part of the triangle is being processed.
## API Reference
```{eval-rst}
.. autosummary::
:nosignatures:
pystencils.codegen.GpuKernel
pystencils.jit.gpu_cupy.CupyKernelWrapper
```
:::{admonition} Developers To Do:
......
......@@ -258,13 +258,17 @@ class CudaPlatform(GenericGpu):
def _prepend_dense_translation(
self, body: PsBlock, ispace: FullIterationSpace
) -> PsBlock:
dimensions = ispace.dimensions_in_loop_order()
ctr_mapping = self._thread_mapping(ispace)
indexing_decls = []
conds = []
dimensions = ispace.dimensions_in_loop_order()
for dim in dimensions:
# counter declarations must be ordered slowest-to-fastest
# such that inner dimensions can depend on outer ones
dim.counter.dtype = constify(dim.counter.get_dtype())
ctr_expr = PsExpression.make(dim.counter)
......@@ -274,8 +278,6 @@ class CudaPlatform(GenericGpu):
if not self._omit_range_check:
conds.append(PsLt(ctr_expr, dim.stop))
indexing_decls = indexing_decls[::-1]
if conds:
condition: PsExpression = conds[0]
for cond in conds[1:]:
......
......@@ -383,7 +383,8 @@ class GpuOptions(ConfigBase):
block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
"""Desired block size for the execution of GPU kernels.
This option only takes effect if `Linear3D` is chosen as an indexing scheme.
This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
is chosen as an indexing scheme.
The block size may be overridden at runtime.
"""
......@@ -573,7 +574,7 @@ class CreateKernelConfig(ConfigBase):
"""Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
gpu_indexing: InitVar[str | None] = None
"""Deprecated; use `gpu.indexing_scheme` instead."""
"""Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead."""
gpu_indexing_params: InitVar[dict | None] = None
"""Deprecated; set options in the `gpu` category instead."""
......
......@@ -56,7 +56,7 @@ class GpuLaunchConfiguration(ABC):
class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
"""Launch configuration that is dynamically computed from kernel parameters.
This launch configuration permits no further user customization
This launch configuration permits no further user customization.
"""
def __init__(
......@@ -233,7 +233,7 @@ class GpuIndexing(ABC):
from ..backend.ast.expressions import PsExpression, PsIntDiv
block_size_symbols = [
self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank)
]
block_size = [
......
......@@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import normalize_slice
from pystencils.jit.gpu_cupy import CupyKernelWrapper
from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration
def test_sliced_iteration():
......@@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
expected[r, r:] = 1.0
update = Assignment(f.center(), 1)
outer_counter = DEFAULTS.spatial_counters[0]
islice = make_slice[:, outer_counter:]
# Have NumPy data layout -> X is slowest coordinate, Y is fastest
slow_counter = DEFAULTS.spatial_counters[0]
islice = make_slice[:, slow_counter:]
gen_config = replace(gen_config, iteration_slice=islice)
if gen_config.target == Target.CUDA:
......@@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
kernel = create_kernel(update, gen_config).compile()
if isinstance(kernel, CupyKernelWrapper):
kernel.block_size = shape + (1,)
kernel.num_blocks = (1, 1, 1)
assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
kernel.launch_config.block_size = shape + (1,)
kernel.launch_config.grid_size = (1, 1, 1)
kernel(f=f_arr)
......@@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
pytest.xfail("Gather/Scatter not implemented yet")
if isinstance(kernel, CupyKernelWrapper):
kernel.block_size = (8, 16, 1)
kernel.num_blocks = (1, 1, 1)
assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
kernel.launch_config.block_size = (8, 16, 1)
kernel.launch_config.grid_size = (1, 1, 1)
kernel(f=f_arr)
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment