Skip to content
Snippets Groups Projects
Commit d9c8f260 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

fix testsuite and docs

parent 26eb2d62
No related branches found
No related tags found
1 merge request !449: GPU Indexing Schemes and Launch Configurations
Pipeline #74069 failed
......@@ -108,6 +108,7 @@ The following categories with target-specific options are exposed:
VectorizationOptions
GpuOptions
SyclOptions
GpuIndexingScheme
.. autosummary::
:toctree: generated
......@@ -176,5 +177,4 @@ The following categories with target-specific options are exposed:
Kernel
GpuKernel
Parameter
GpuThreadsRange
```
......@@ -51,12 +51,6 @@ ps.inspect(kernel)
The `kernel` object returned by the code generator in above snippet is an instance
of the {py:class}`GpuKernel` class.
It extends {py:class}`Kernel` with some GPU-specific information.
In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
property, which tells us how many threads the kernel is expecting to be executed with:
```{code-cell} ipython3
kernel.threads_range
```
If a GPU is available and [CuPy][cupy] is installed in the current environment,
the kernel can be compiled and run immediately.
......@@ -87,7 +81,7 @@ kfunc = kernel.compile()
kfunc(f=f_arr, g=g_arr)
```
### Modifying the Launch Grid
### Modifying the Launch Configuration
The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
This object holds the kernel's launch grid configuration
......@@ -150,26 +144,26 @@ assignments = [
```{code-cell} ipython3
y = ps.DEFAULTS.spatial_counters[0]
cfg = ps.CreateKernelConfig(
target=ps.Target.CUDA,
iteration_slice=ps.make_slice[:, y:]
)
kernel = ps.create_kernel(assignments, cfg).compile()
cfg = ps.CreateKernelConfig()
cfg.target= ps.Target.CUDA
cfg.iteration_slice = ps.make_slice[:, y:]
```
This warns us that the threads range could not be determined automatically.
We can disable this warning by setting `manual_launch_grid` in the GPU option category:
In this case, it is necessary to set the `gpu.manual_launch_grid` option to `True`;
otherwise, code generation will fail as the code generator cannot figure out
a GPU grid size on its own:
```{code-cell}
```{code-cell} ipython3
cfg.gpu.manual_launch_grid = True
kernel = ps.create_kernel(assignments, cfg).compile()
```
Now, to execute our kernel, we have to manually specify its launch grid:
```{code-cell} ipython3
kernel.block_size = (8, 8)
kernel.num_blocks = (2, 2)
kernel.launch_config.block_size = (8, 8)
kernel.launch_config.grid_size = (2, 2)
```
This way the kernel will cover this iteration space:
......@@ -184,8 +178,8 @@ _draw_ispace(cp.asnumpy(f_arr))
We can also observe the effect of decreasing the launch grid size:
```{code-cell} ipython3
kernel.block_size = (4, 4)
kernel.num_blocks = (2, 3)
kernel.launch_config.block_size = (4, 4)
kernel.launch_config.grid_size = (2, 3)
```
```{code-cell} ipython3
......@@ -199,15 +193,6 @@ Here, since there are only eight threads operating in $x$-direction,
and twelve threads in $y$-direction,
only a part of the triangle is being processed.
## API Reference
```{eval-rst}
.. autosummary::
:nosignatures:
pystencils.codegen.GpuKernel
pystencils.jit.gpu_cupy.CupyKernelWrapper
```
:::{admonition} Developers To Do:
......
......@@ -258,13 +258,17 @@ class CudaPlatform(GenericGpu):
def _prepend_dense_translation(
self, body: PsBlock, ispace: FullIterationSpace
) -> PsBlock:
dimensions = ispace.dimensions_in_loop_order()
ctr_mapping = self._thread_mapping(ispace)
indexing_decls = []
conds = []
dimensions = ispace.dimensions_in_loop_order()
for dim in dimensions:
# counter declarations must be ordered slowest-to-fastest
# such that inner dimensions can depend on outer ones
dim.counter.dtype = constify(dim.counter.get_dtype())
ctr_expr = PsExpression.make(dim.counter)
......@@ -274,8 +278,6 @@ class CudaPlatform(GenericGpu):
if not self._omit_range_check:
conds.append(PsLt(ctr_expr, dim.stop))
indexing_decls = indexing_decls[::-1]
if conds:
condition: PsExpression = conds[0]
for cond in conds[1:]:
......
......@@ -383,7 +383,8 @@ class GpuOptions(ConfigBase):
block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
"""Desired block size for the execution of GPU kernels.
This option only takes effect if `Linear3D` is chosen as an indexing scheme.
This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
is chosen as an indexing scheme.
The block size may be overridden at runtime.
"""
......@@ -573,7 +574,7 @@ class CreateKernelConfig(ConfigBase):
"""Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
gpu_indexing: InitVar[str | None] = None
"""Deprecated; use `gpu.indexing_scheme` instead."""
"""Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead."""
gpu_indexing_params: InitVar[dict | None] = None
"""Deprecated; set options in the `gpu` category instead."""
......
......@@ -56,7 +56,7 @@ class GpuLaunchConfiguration(ABC):
class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
"""Launch configuration that is dynamically computed from kernel parameters.
This launch configuration permits no further user customization
This launch configuration permits no further user customization.
"""
def __init__(
......@@ -233,7 +233,7 @@ class GpuIndexing(ABC):
from ..backend.ast.expressions import PsExpression, PsIntDiv
block_size_symbols = [
self._ctx.get_new_symbol(f"gpuBlockSize_{c}") for c in range(rank)
self._ctx.get_new_symbol(f"gpuBlockSize_{c}", self._ctx.index_dtype) for c in range(rank)
]
block_size = [
......
......@@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import normalize_slice
from pystencils.jit.gpu_cupy import CupyKernelWrapper
from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration
def test_sliced_iteration():
......@@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
expected[r, r:] = 1.0
update = Assignment(f.center(), 1)
outer_counter = DEFAULTS.spatial_counters[0]
islice = make_slice[:, outer_counter:]
# Have NumPy data layout -> X is slowest coordinate, Y is fastest
slow_counter = DEFAULTS.spatial_counters[0]
islice = make_slice[:, slow_counter:]
gen_config = replace(gen_config, iteration_slice=islice)
if gen_config.target == Target.CUDA:
......@@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
kernel = create_kernel(update, gen_config).compile()
if isinstance(kernel, CupyKernelWrapper):
kernel.block_size = shape + (1,)
kernel.num_blocks = (1, 1, 1)
assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
kernel.launch_config.block_size = shape + (1,)
kernel.launch_config.grid_size = (1, 1, 1)
kernel(f=f_arr)
......@@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
pytest.xfail("Gather/Scatter not implemented yet")
if isinstance(kernel, CupyKernelWrapper):
kernel.block_size = (8, 16, 1)
kernel.num_blocks = (1, 1, 1)
assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
kernel.launch_config.block_size = (8, 16, 1)
kernel.launch_config.grid_size = (1, 1, 1)
kernel(f=f_arr)
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment