From d9c8f260133994db177aa42db2f608cd4b418e56 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Feb 2025 19:29:02 +0100
Subject: [PATCH] fix testsuite and docs

---
 docs/source/api/codegen.md                    |  2 +-
 docs/source/user_manual/gpu_kernels.md        | 43 ++++++-------------
 src/pystencils/backend/platforms/cuda.py      | 10 +++--
 src/pystencils/codegen/config.py              |  5 ++-
 src/pystencils/codegen/gpu_indexing.py        |  4 +-
 tests/kernelcreation/test_iteration_slices.py | 19 +++++---
 6 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md
index b739a4f33..8e374d4e5 100644
--- a/docs/source/api/codegen.md
+++ b/docs/source/api/codegen.md
@@ -108,6 +108,7 @@ The following categories with target-specific options are exposed:
   VectorizationOptions
   GpuOptions
   SyclOptions
+  GpuIndexingScheme
 
 .. autosummary::
   :toctree: generated
@@ -176,5 +177,4 @@ The following categories with target-specific options are exposed:
   Kernel
   GpuKernel
   Parameter
-  GpuThreadsRange
 ```
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 4db2d7944..d3a491707 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -51,12 +51,6 @@ ps.inspect(kernel)
 The `kernel` object returned by the code generator in above snippet is an instance
 of the {py:class}`GpuKernel` class.
 It extends {py:class}`Kernel` with some GPU-specific information.
-In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
-property, which tells us how many threads the kernel is expecting to be executed with:
-
-```{code-cell} ipython3
-kernel.threads_range
-```
 
 If a GPU is available and [CuPy][cupy] is installed in the current environment,
 the kernel can be compiled and run immediately.
@@ -87,7 +81,7 @@ kfunc = kernel.compile()
 kfunc(f=f_arr, g=g_arr)
 ```
 
-### Modifying the Launch Grid
+### Modifying the Launch Configuration
 
 The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
 This object holds the kernel's launch grid configuration
@@ -150,26 +144,26 @@ assignments = [
 
 ```{code-cell} ipython3
 y = ps.DEFAULTS.spatial_counters[0]
-cfg = ps.CreateKernelConfig(
-    target=ps.Target.CUDA,
-    iteration_slice=ps.make_slice[:, y:]
-)
-    
-kernel = ps.create_kernel(assignments, cfg).compile()
+cfg = ps.CreateKernelConfig()
+cfg.target = ps.Target.CUDA
+cfg.iteration_slice = ps.make_slice[:, y:]
 ```
 
-This warns us that the threads range could not be determined automatically.
-We can disable this warning by setting `manual_launch_grid` in the GPU option category:
+In this case, it is necessary to set the `gpu.manual_launch_grid` option to `True`;
+otherwise, code generation will fail as the code generator cannot figure out
+a GPU grid size on its own:
 
-```{code-cell}
+```{code-cell} ipython3
 cfg.gpu.manual_launch_grid = True
+
+kernel = ps.create_kernel(assignments, cfg).compile()
 ```
 
 Now, to execute our kernel, we have to manually specify its launch grid:
 
 ```{code-cell} ipython3
-kernel.block_size = (8, 8)
-kernel.num_blocks = (2, 2)
+kernel.launch_config.block_size = (8, 8)
+kernel.launch_config.grid_size = (2, 2)
 ```
 
 This way the kernel will cover this iteration space:
@@ -184,8 +178,8 @@ _draw_ispace(cp.asnumpy(f_arr))
 We can also observe the effect of decreasing the launch grid size:
 
 ```{code-cell} ipython3
-kernel.block_size = (4, 4)
-kernel.num_blocks = (2, 3)
+kernel.launch_config.block_size = (4, 4)
+kernel.launch_config.grid_size = (2, 3)
 ```
 
 ```{code-cell} ipython3
@@ -199,15 +193,6 @@ Here, since there are only eight threads operating in $x$-direction,
 and twelve threads in $y$-direction,
 only a part of the triangle is being processed.
 
-## API Reference
-
-```{eval-rst}
-.. autosummary::
-  :nosignatures:
-
-  pystencils.codegen.GpuKernel
-  pystencils.jit.gpu_cupy.CupyKernelWrapper
-```
 
 :::{admonition} Developers To Do:
 
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 122011eb0..537c92db1 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -258,13 +258,17 @@ class CudaPlatform(GenericGpu):
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
     ) -> PsBlock:
-        dimensions = ispace.dimensions_in_loop_order()
-
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
         conds = []
+
+        dimensions = ispace.dimensions_in_loop_order()
+
         for dim in dimensions:
+            # counter declarations must be ordered slowest-to-fastest
+            # such that inner dimensions can depend on outer ones
+
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
             ctr_expr = PsExpression.make(dim.counter)
@@ -274,8 +278,6 @@ class CudaPlatform(GenericGpu):
             if not self._omit_range_check:
                 conds.append(PsLt(ctr_expr, dim.stop))
 
-        indexing_decls = indexing_decls[::-1]
-
         if conds:
             condition: PsExpression = conds[0]
             for cond in conds[1:]:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 2d62f286b..0d43b40e3 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -383,7 +383,8 @@ class GpuOptions(ConfigBase):
     block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
     """Desired block size for the execution of GPU kernels.
     
-    This option only takes effect if `Linear3D` is chosen as an indexing scheme.
+    This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
+    is chosen as an indexing scheme.
     The block size may be overridden at runtime.
     """
 
@@ -573,7 +574,7 @@ class CreateKernelConfig(ConfigBase):
     """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
 
     gpu_indexing: InitVar[str | None] = None
-    """Deprecated; use `gpu.indexing_scheme` instead."""
+    """Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead."""
 
     gpu_indexing_params: InitVar[dict | None] = None
     """Deprecated; set options in the `gpu` category instead."""
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 1e23a820e..80af8aba8 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -56,7 +56,7 @@ class GpuLaunchConfiguration(ABC):
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """Launch configuration that is dynamically computed from kernel parameters.
 
-    This launch configuration permits no further user customization
+    This launch configuration permits no further user customization.
     """
 
     def __init__(
@@ -233,7 +233,7 @@ class GpuIndexing(ABC):
         from ..backend.ast.expressions import PsExpression, PsIntDiv
 
         block_size_symbols = [
-            self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
+            self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank)
         ]
 
         block_size = [
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index 02b6b9922..b1f2da576 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import normalize_slice
 from pystencils.jit.gpu_cupy import CupyKernelWrapper
+from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration
 
 
 def test_sliced_iteration():
@@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
         expected[r, r:] = 1.0
 
     update = Assignment(f.center(), 1)
-    outer_counter = DEFAULTS.spatial_counters[0]
-    islice = make_slice[:, outer_counter:]
+
+    #   Have NumPy data layout -> X is slowest coordinate, Y is fastest
+    slow_counter = DEFAULTS.spatial_counters[0]
+    islice = make_slice[:, slow_counter:]
     gen_config = replace(gen_config, iteration_slice=islice)
 
     if gen_config.target == Target.CUDA:
@@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
     kernel = create_kernel(update, gen_config).compile()
 
     if isinstance(kernel, CupyKernelWrapper):
-        kernel.block_size = shape + (1,)
-        kernel.num_blocks = (1, 1, 1)
+        assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
+
+        kernel.launch_config.block_size = shape + (1,)
+        kernel.launch_config.grid_size = (1, 1, 1)
 
     kernel(f=f_arr)
 
@@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
             pytest.xfail("Gather/Scatter not implemented yet")
 
     if isinstance(kernel, CupyKernelWrapper):
-        kernel.block_size = (8, 16, 1)
-        kernel.num_blocks = (1, 1, 1)
+        assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
+
+        kernel.launch_config.block_size = (8, 16, 1)
+        kernel.launch_config.grid_size = (1, 1, 1)
 
     kernel(f=f_arr)
 
-- 
GitLab