From d9c8f260133994db177aa42db2f608cd4b418e56 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 14 Feb 2025 19:29:02 +0100 Subject: [PATCH] fix testsuite and docs --- docs/source/api/codegen.md | 2 +- docs/source/user_manual/gpu_kernels.md | 43 ++++++------------- src/pystencils/backend/platforms/cuda.py | 10 +++-- src/pystencils/codegen/config.py | 5 ++- src/pystencils/codegen/gpu_indexing.py | 4 +- tests/kernelcreation/test_iteration_slices.py | 19 +++++--- 6 files changed, 39 insertions(+), 44 deletions(-) diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md index b739a4f33..8e374d4e5 100644 --- a/docs/source/api/codegen.md +++ b/docs/source/api/codegen.md @@ -108,6 +108,7 @@ The following categories with target-specific options are exposed: VectorizationOptions GpuOptions SyclOptions + GpuIndexingScheme .. autosummary:: :toctree: generated @@ -176,5 +177,4 @@ The following categories with target-specific options are exposed: Kernel GpuKernel Parameter - GpuThreadsRange ``` diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 4db2d7944..d3a491707 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -51,12 +51,6 @@ ps.inspect(kernel) The `kernel` object returned by the code generator in above snippet is an instance of the {py:class}`GpuKernel` class. It extends {py:class}`Kernel` with some GPU-specific information. -In particular, it defines the {any}`threads_range <GpuKernel.threads_range>` -property, which tells us how many threads the kernel is expecting to be executed with: - -```{code-cell} ipython3 -kernel.threads_range -``` If a GPU is available and [CuPy][cupy] is installed in the current environment, the kernel can be compiled and run immediately. 
@@ -87,7 +81,7 @@ kfunc = kernel.compile() kfunc(f=f_arr, g=g_arr) ``` -### Modifying the Launch Grid +### Modifying the Launch Configuration The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object. This object holds the kernel's launch grid configuration @@ -150,26 +144,26 @@ assignments = [ ```{code-cell} ipython3 y = ps.DEFAULTS.spatial_counters[0] -cfg = ps.CreateKernelConfig( - target=ps.Target.CUDA, - iteration_slice=ps.make_slice[:, y:] -) - -kernel = ps.create_kernel(assignments, cfg).compile() +cfg = ps.CreateKernelConfig() +cfg.target = ps.Target.CUDA +cfg.iteration_slice = ps.make_slice[:, y:] ``` -This warns us that the threads range could not be determined automatically. -We can disable this warning by setting `manual_launch_grid` in the GPU option category: +In this case, it is necessary to set the `gpu.manual_launch_grid` option to `True`; +otherwise, code generation will fail as the code generator cannot figure out +a GPU grid size on its own: -```{code-cell} +```{code-cell} ipython3 cfg.gpu.manual_launch_grid = True + +kernel = ps.create_kernel(assignments, cfg).compile() ``` Now, to execute our kernel, we have to manually specify its launch grid: ```{code-cell} ipython3 -kernel.block_size = (8, 8) -kernel.num_blocks = (2, 2) +kernel.launch_config.block_size = (8, 8) +kernel.launch_config.grid_size = (2, 2) ``` This way the kernel will cover this iteration space: @@ -184,8 +178,8 @@ _draw_ispace(cp.asnumpy(f_arr)) We can also observe the effect of decreasing the launch grid size: ```{code-cell} ipython3 -kernel.block_size = (4, 4) -kernel.num_blocks = (2, 3) +kernel.launch_config.block_size = (4, 4) +kernel.launch_config.grid_size = (2, 3) ``` ```{code-cell} ipython3 @@ -199,15 +193,6 @@ Here, since there are only eight threads operating in $x$-direction, and twelve threads in $y$-direction, only a part of the triangle is being processed. -## API Reference - -```{eval-rst} -.. 
autosummary:: - :nosignatures: - - pystencils.codegen.GpuKernel - pystencils.jit.gpu_cupy.CupyKernelWrapper -``` :::{admonition} Developers To Do: diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 122011eb0..537c92db1 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -258,13 +258,17 @@ class CudaPlatform(GenericGpu): def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace ) -> PsBlock: - dimensions = ispace.dimensions_in_loop_order() - ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] conds = [] + + dimensions = ispace.dimensions_in_loop_order() + for dim in dimensions: + # counter declarations must be ordered slowest-to-fastest + # such that inner dimensions can depend on outer ones + dim.counter.dtype = constify(dim.counter.get_dtype()) ctr_expr = PsExpression.make(dim.counter) @@ -274,8 +278,6 @@ class CudaPlatform(GenericGpu): if not self._omit_range_check: conds.append(PsLt(ctr_expr, dim.stop)) - indexing_decls = indexing_decls[::-1] - if conds: condition: PsExpression = conds[0] for cond in conds[1:]: diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 2d62f286b..0d43b40e3 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -383,7 +383,8 @@ class GpuOptions(ConfigBase): block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO) """Desired block size for the execution of GPU kernels. - This option only takes effect if `Linear3D` is chosen as an indexing scheme. + This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>` + is chosen as an indexing scheme. The block size may be overridden at runtime. 
""" @@ -573,7 +574,7 @@ class CreateKernelConfig(ConfigBase): """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead.""" gpu_indexing: InitVar[str | None] = None - """Deprecated; use `gpu.indexing_scheme` instead.""" + """Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead.""" gpu_indexing_params: InitVar[dict | None] = None """Deprecated; set options in the `gpu` category instead.""" diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 1e23a820e..80af8aba8 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -56,7 +56,7 @@ class GpuLaunchConfiguration(ABC): class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """Launch configuration that is dynamically computed from kernel parameters. - This launch configuration permits no further user customization + This launch configuration permits no further user customization. """ def __init__( @@ -233,7 +233,7 @@ class GpuIndexing(ABC): from ..backend.ast.expressions import PsExpression, PsIntDiv block_size_symbols = [ - self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank) + self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank) ] block_size = [ diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py index 02b6b9922..b1f2da576 100644 --- a/tests/kernelcreation/test_iteration_slices.py +++ b/tests/kernelcreation/test_iteration_slices.py @@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem from pystencils.simp import sympy_cse_on_assignment_list from pystencils.slicing import normalize_slice from pystencils.jit.gpu_cupy import CupyKernelWrapper +from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration def test_sliced_iteration(): @@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): expected[r, r:] = 1.0 update = 
Assignment(f.center(), 1) - outer_counter = DEFAULTS.spatial_counters[0] - islice = make_slice[:, outer_counter:] + + # Have NumPy data layout -> X is slowest coordinate, Y is fastest + slow_counter = DEFAULTS.spatial_counters[0] + islice = make_slice[:, slow_counter:] gen_config = replace(gen_config, iteration_slice=islice) if gen_config.target == Target.CUDA: @@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): kernel = create_kernel(update, gen_config).compile() if isinstance(kernel, CupyKernelWrapper): - kernel.block_size = shape + (1,) - kernel.num_blocks = (1, 1, 1) + assert isinstance(kernel.launch_config, ManualLaunchConfiguration) + + kernel.launch_config.block_size = shape + (1,) + kernel.launch_config.grid_size = (1, 1, 1) kernel(f=f_arr) @@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp): pytest.xfail("Gather/Scatter not implemented yet") if isinstance(kernel, CupyKernelWrapper): - kernel.block_size = (8, 16, 1) - kernel.num_blocks = (1, 1, 1) + assert isinstance(kernel.launch_config, ManualLaunchConfiguration) + + kernel.launch_config.block_size = (8, 16, 1) + kernel.launch_config.grid_size = (1, 1, 1) kernel(f=f_arr) -- GitLab