From 8af40ae4a930afd3b91078639b897050781c6c8a Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 12 Dec 2024 11:06:52 +0100 Subject: [PATCH] Clarify some doc comments; clarify launch grid specification --- docs/source/reference/gpu_kernels.md | 19 +++++++++++++------ src/pystencils/config.py | 7 ++++++- src/pystencils/target.py | 2 +- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/reference/gpu_kernels.md index c3fa70ec2..1045f80d4 100644 --- a/docs/source/reference/gpu_kernels.md +++ b/docs/source/reference/gpu_kernels.md @@ -78,19 +78,26 @@ kfunc(f=f_arr, g=g_arr) ### Modifying the Launch Grid The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object. -Its interface allows us to customize the GPU launch grid. -We can manually set both the number of threads per block, and the number of blocks on the grid: +This object holds the kernel's launch grid configuration +(i.e. the number of thread blocks, and the number of threads per block). +Pystencils specifies a default value for the block size and, if possible, +the number of blocks is automatically inferred in order to cover the entire iteration space. +In addition, the wrapper's interface allows us to customize the GPU launch grid, +by manually setting both the number of threads per block, and the number of blocks on the grid: ```{code-cell} ipython3 kfunc.block_size = (16, 8, 8) kfunc.num_blocks = (1, 2, 2) ``` -In most cases, the number of blocks is automatically inferred from the block size -in order to cover the entire iteration space, so it does not need to be specified. -Setting a launch grid that is larger than the iteration space is also possible, -but will cause any threads working outside of the iteration bounds to idle. 
+For most kernels, setting only the `block_size` is sufficient since pystencils will +automatically compute the number of blocks; +for exceptions to this, see [](#manual_launch_grids). +If `num_blocks` is set manually and the launch grid thus specified is too small, only +a part of the iteration space will be traversed by the kernel; +similarly, if it is too large, it will cause any threads working outside of the iteration bounds to idle. +(manual_launch_grids)= ### Manual Launch Grids and Non-Cuboid Iteration Patterns In some cases, it will be unavoidable to set the launch grid size manually; diff --git a/src/pystencils/config.py b/src/pystencils/config.py index c08ddc161..506f7fd78 100644 --- a/src/pystencils/config.py +++ b/src/pystencils/config.py @@ -33,7 +33,12 @@ class _AUTO_TYPE: AUTO = _AUTO_TYPE() -"""Special value that can be passed to some options for invoking automatic behaviour.""" +"""Special value that can be passed to some options for invoking automatic behaviour. + +Currently, these options permit `AUTO`: + +- `ghost_layers <CreateKernelConfig.ghost_layers>` +""" @dataclass diff --git a/src/pystencils/target.py b/src/pystencils/target.py index 7f26c4466..5d897dd7e 100644 --- a/src/pystencils/target.py +++ b/src/pystencils/target.py @@ -87,7 +87,7 @@ class Target(Flag): """ GPU = CUDA - """Alias for `Target.CUDA`, for backward compatibility.""" + """Alias for `Target.CUDA`, for backward compatibility.""" SYCL = _GPU | _SYCL """SYCL kernel target. -- GitLab