From 6b3f5288dcdc074325667d40682b5e2610962348 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 17 Feb 2025 11:47:40 +0100
Subject: [PATCH] small extension to the user guide

---
 docs/source/user_manual/gpu_kernels.md | 45 ++++++++++++++++++--------
 tests/kernelcreation/test_gpu.py       |  6 +---
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 2fa7cd056..610c61ddf 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -54,7 +54,19 @@ It extends {py:class}`Kernel` with some GPU-specific information.
 
 If a GPU is available and [CuPy][cupy] is installed in the current environment,
 the kernel can be compiled and run immediately.
-To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field.
+To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+import cupy as cp
+
+rng = cp.random.default_rng(seed=42)
+f_arr = rng.random((16, 16, 16))
+g_arr = cp.zeros_like(f_arr)
+
+kfunc = kernel.compile()
+kfunc(f=f_arr, g=g_arr)
+```
 
 :::{note}
 [CuPy][cupy] is a Python library for numerical computations on GPU arrays,
@@ -69,18 +81,6 @@ and to allocate and manage the data these kernels can be executed on.
 For more information on CuPy, refer to [their documentation][cupy-docs].
 :::
 
-```{code-cell} ipython3
-:tags: [raises-exception]
-import cupy as cp
-
-rng = cp.random.default_rng(seed=42)
-f_arr = rng.random((16, 16, 16))
-g_arr = cp.zeros_like(f_arr)
-
-kfunc = kernel.compile()
-kfunc(f=f_arr, g=g_arr)
-```
-
 (indexing_and_launch_config)=
 ## Modify the Indexing Scheme and Launch Configuration
 
@@ -130,6 +130,25 @@ kfunc(f=f_arr, g=g_arr)
 In any case, pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
 and the given thread block size.
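+
+As a rough illustration of that computation (a sketch only; the exact logic lives inside
+pystencils), the grid has to contain enough blocks to cover the whole iteration space,
+e.g. by rounding up in each dimension:
+
+```{code-cell} ipython3
+import math
+
+# Illustrative only: enough blocks of size (8, 4, 4) to cover an
+# iteration space of 14 x 14 x 14 cells.
+iteration_space = (14, 14, 14)
+block_size = (8, 4, 4)
+grid_size = tuple(math.ceil(e / b) for e, b in zip(iteration_space, block_size))
+grid_size
+```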
 
+:::{attention}
+
+Consistent with the way GPU hardware groups the threads of a block into warps
+(along the `x` index first), pystencils maps the kernel's *fastest* spatial coordinate
+onto the `x` block and thread indices, the second-fastest to `y`,
+and the slowest coordinate to `z`.
+
+This means that, when using `cupy` arrays with the default memory layout
+(corresponding to the `"numpy"` field layout specifier),
+the *thread coordinates* and the *spatial coordinates*
+map to each other in *opposite order*:
+
+| Spatial Coordinate | Thread Index  |
+|--------------------|---------------|
+| `x` (slowest)      | `threadIdx.z` |
+| `y`                | `threadIdx.y` |
+| `z` (fastest)      | `threadIdx.x` |
+
+:::
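+
+As a small sketch of the practical consequence (assuming, as in the snippets above, that the
+compiled kernel `kfunc` exposes its launch configuration as `launch_config`, and reusing the
+CuPy arrays from before), the first (`x`) entry of a manually chosen block size counts threads
+along the *last* axis of a `"numpy"`-layout array:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+# For a C-contiguous ("numpy"-layout) array, the last axis has the smallest
+# stride; it is the fastest spatial coordinate and is mapped to threadIdx.x.
+print(f_arr.strides)
+
+# Accordingly, the first entry of this block size (8 threads) runs along the
+# last array axis, while the last entry (4 threads) runs along the first axis.
+kfunc.launch_config.block_size = (8, 4, 4)
+kfunc(f=f_arr, g=g_arr)
+```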
+
 (manual_launch_grids)=
 ### Manual Launch Grids and Non-Cuboid Iteration Patterns
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index d80647fb6..75239c9b1 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -67,11 +67,7 @@ def test_indexing_options(
                 kernel.launch_config.grid_size = (32, 16, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (
-            10,
-            8,
-            8,
-        )  # must fit the src_arr shape (without ghost layers)
+        kernel.launch_config.block_size = (10, 8, 8)
 
     kernel(src=src_arr, dst=dst_arr)
 
-- 
GitLab