Commit 6c635e09 authored by Richard Angersbach

Merge branch 'devel/fix_reduction_docs' into 'v2.0-dev'

Fix output of warp-level kernel in reduction user guide

See merge request !465
parents 1cea8375 ac5bc2ce
@@ -88,8 +88,8 @@ create the kernel object via the {any}`create_kernel` function.
For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled.

```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
-kernel = ps.create_kernel(assign_sum, cfg)
+cpu_cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel = ps.create_kernel(assign_sum, cpu_cfg)
ps.inspect(kernel)
```
@@ -111,14 +111,14 @@ but will be incorporated in the reduction computation.
Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a single value.

```{code-cell} ipython3
kernel_func = kernel.compile()

x_array = np.ones((4, 4, 4), dtype="float64")
reduction_result = np.zeros((1,), dtype="float64")

kernel_func(x=x_array, r=reduction_result)
reduction_result[0]
```
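A quick cross-check of the expected output, assuming the write-back array's initial value of zero is folded into the reduction as the text above states:

```python
import numpy as np

# 4 * 4 * 4 ones sum to 64.0; the initial 0.0 in reduction_result is included.
np.ones((4, 4, 4), dtype="float64").sum()  # 64.0
```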
### GPU Platforms
@@ -128,11 +128,11 @@ Similar to the CPU section, a base variant for NVIDIA GPUs without
explicitly employing any optimizations is shown:

```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-kernel_gpu = ps.create_kernel(assign_sum, cfg)
+gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+kernel_gpu = ps.create_kernel(assign_sum, gpu_cfg)
ps.inspect(kernel_gpu)
```

The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer
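(The sentence presumably continues that fields and the write-back pointer must reside in device memory.) A minimal sketch of such an invocation, assuming `cupy` is available and following the same pattern the GPU test further below uses (`cp.asarray`, `.get()`); the variable names are illustrative, not part of the diff:

```python
import cupy as cp

# Fields and the write-back pointer live in device memory.
x_gpu = cp.ones((4, 4, 4), dtype="float64")
reduction_result_gpu = cp.zeros((1,), dtype="float64")

kernel_gpu_func = kernel_gpu.compile()
kernel_gpu_func(x=x_gpu, r=reduction_result_gpu)

# Copy the scalar result back to the host.
reduction_result_gpu.get()[0]
```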
@@ -159,17 +159,17 @@ which are not supported yet.
```{code-cell} ipython3
# configure SIMD vectorization
-cfg = ps.CreateKernelConfig(
+cpu_cfg_opt = ps.CreateKernelConfig(
    target=ps.Target.X86_AVX,
)
-cfg.cpu.vectorize.enable = True
-cfg.cpu.vectorize.assume_inner_stride_one = True
+cpu_cfg_opt.cpu.vectorize.enable = True
+cpu_cfg_opt.cpu.vectorize.assume_inner_stride_one = True

# configure OpenMP parallelization
-cfg.cpu.openmp.enable = True
-cfg.cpu.openmp.num_threads = 8
+cpu_cfg_opt.cpu.openmp.enable = True
+cpu_cfg_opt.cpu.openmp.num_threads = 8

-kernel_cpu_opt = ps.create_kernel(assign_sum, cfg)
+kernel_cpu_opt = ps.create_kernel(assign_sum, cpu_cfg_opt)
ps.inspect(kernel_cpu_opt)
```
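Running the optimized kernel would presumably follow the same pattern as the base CPU run above, assuming the compiled kernel keeps the parameter names `x` and `r`; a hypothetical invocation:

```python
kernel_cpu_opt_func = kernel_cpu_opt.compile()

# A larger array gives the SIMD lanes and the 8 OpenMP threads actual work.
x_array = np.ones((64, 64, 64), dtype="float64")
reduction_result = np.zeros((1,), dtype="float64")

kernel_cpu_opt_func(x=x_array, r=reduction_result)
reduction_result[0]  # expected: 64**3 = 262144.0
```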
@@ -190,13 +190,14 @@ we employ a block fitting algorithm to obtain a block size that is also optimized
You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.

```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-cfg.gpu.assume_warp_aligned_block_size = True
-kernel_gpu_opt = ps.create_kernel(assign_sum, cfg)
+gpu_cfg_opt = ps.CreateKernelConfig(target=ps.Target.CUDA)
+gpu_cfg_opt.gpu.assume_warp_aligned_block_size = True
+gpu_cfg_opt.gpu.warp_size = 32
+kernel_gpu_opt = ps.create_kernel(assign_sum, gpu_cfg_opt)

kernel_func = kernel_gpu_opt.compile()
kernel_func.launch_config.fit_block_size((32, 1, 1))

ps.inspect(kernel_gpu_opt)
```
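The warp-alignment constraint behind `assume_warp_aligned_block_size` is plain arithmetic: the number of threads per block must be a multiple of the warp size. A hypothetical helper illustrating the rounding involved (not the pystencils implementation):

```python
def round_up_to_warp_multiple(threads_per_block: int, warp_size: int = 32) -> int:
    # Smallest multiple of warp_size that is >= threads_per_block.
    return ((threads_per_block + warp_size - 1) // warp_size) * warp_size

assert round_up_to_warp_multiple(48) == 64
assert round_up_to_warp_multiple(64) == 64
```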
@@ -256,21 +256,18 @@ class GenericGpu(Platform):
        raise MaterializationError(f"Unknown type of iteration space: {ispace}")

    @staticmethod
-    def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
+    def _block_local_thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
        """Returns thread indices multiplied with block dimension strides per dimension."""
        return tuple(
-            idx
-            * PsConstantExpr(
-                PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
-            )
+            idx * reduce(operator.mul, BLOCK_DIM[:i]) if i > 0 else idx
            for i, idx in enumerate(THREAD_IDX[: ispace.rank])
        )

    def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression:
        """Returns expression that determines whether a thread is the first within a warp."""
-        tids_per_dim = GenericGpu._thread_index_per_dim(ispace)
+        tids_per_dim = GenericGpu._block_local_thread_index_per_dim(ispace)
        tid: PsExpression = tids_per_dim[0]
        for t in tids_per_dim[1:]:
            tid = PsAdd(tid, t)
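To see what the renamed helper computes: the block-local linear thread ID is the thread index dotted with the block-dimension strides, and `_first_thread_in_warp` presumably checks whether that ID is a multiple of the warp size. A plain-Python analogue of the arithmetic (illustrative only, not the pystencils expression tree):

```python
import operator
from functools import reduce

def block_local_linear_tid(thread_idx: tuple[int, ...], block_dim: tuple[int, ...]) -> int:
    # Stride of dimension i is the product of all block dimensions below i,
    # e.g. (1, block_dim[0], block_dim[0] * block_dim[1]) for rank 3.
    return sum(
        idx * reduce(operator.mul, block_dim[:i], 1)
        for i, idx in enumerate(thread_idx)
    )

# Thread (0, 2, 0) in a (16, 8, 1) block has linear ID 2 * 16 = 32,
# i.e. the first thread of the second warp for warp size 32.
assert block_local_linear_tid((0, 2, 0), (16, 8, 1)) % 32 == 0
```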
@@ -5,21 +5,35 @@ import pystencils as ps
from pystencils import Target
from pystencils.sympyextensions import ReductionOp, reduction_assignment

-INIT_W = 5
-INIT_ARR = 2
-SIZE = 15
-SOLUTION = {
-    ReductionOp.Add: INIT_W + INIT_ARR * SIZE,
-    ReductionOp.Sub: INIT_W - INIT_ARR * SIZE,
-    ReductionOp.Mul: INIT_W * INIT_ARR**SIZE,
-    ReductionOp.Min: min(INIT_W, INIT_ARR),
-    ReductionOp.Max: max(INIT_W, INIT_ARR),
-}
+init_w = 5.0
+arr_size = 16
+
+
+def init_arr(op):
+    match op:
+        case ReductionOp.Mul:
+            return 0.99  # avoid value overflow for large array sizes
+        case _:
+            return 2.0
+
+
+def get_expected_solution(op, array):
+    match op:
+        case ReductionOp.Add:
+            return init_w + np.sum(array)
+        case ReductionOp.Sub:
+            return init_w - np.sum(array)
+        case ReductionOp.Mul:
+            return init_w * np.prod(array)
+        case ReductionOp.Min:
+            return min(init_w, np.min(array))
+        case ReductionOp.Max:
+            return max(init_w, np.max(array))


# get AST for kernel with reduction assignment
-def get_reduction_assign_ast(dtype, op, config):
-    x = ps.fields(f"x: {dtype}[1d]")
+def get_reduction_assign_ast(dtype, op, dims, config):
+    x = ps.fields(f"x: {dtype}[{dims}d]")
    w = ps.TypedSymbol("w", dtype)

    red_assign = reduction_assignment(w, op, x.center())
@@ -27,15 +41,22 @@ def get_reduction_assign_ast(dtype, op, config):
    return ps.create_kernel([red_assign], config, default_dtype=dtype)


-def get_cpu_array(op, dtype):
+def get_cpu_array(dtype, op, dims):
+    shape = (arr_size,) * dims
+
    # increase difficulty of min/max tests by using range of values
    match op:
-        case ReductionOp.Min:
-            return np.linspace(INIT_ARR, INIT_ARR + SIZE, SIZE, dtype=dtype)
-        case ReductionOp.Max:
-            return np.linspace(INIT_ARR - SIZE, INIT_ARR, SIZE, dtype=dtype)
+        case ReductionOp.Min | ReductionOp.Max:
+            lo = init_arr(op) - arr_size
+            mi = init_arr(op)
+            hi = init_arr(op) + arr_size
+
+            if op is ReductionOp.Min:
+                return np.random.randint(mi, hi, size=shape).astype(dtype)
+            else:
+                return np.random.randint(lo, mi, size=shape).astype(dtype)
        case _:
-            return np.full((SIZE,), INIT_ARR, dtype=dtype)
+            return np.full(shape, init_arr(op), dtype=dtype)


@pytest.mark.parametrize(
@@ -52,7 +73,12 @@ def get_cpu_array(op, dtype):
        ReductionOp.Max,
    ],
)
-def test_reduction_cpu(target, dtype, op):
+@pytest.mark.parametrize("dims", [1, 2, 3])
+def test_reduction_cpu(
+    target: ps.Target,
+    dtype: str,
+    op: str,
+    dims: int):
    config = ps.CreateKernelConfig(target=target)
    config.cpu.openmp.enable = True
@@ -60,14 +86,14 @@ def test_reduction_cpu(target, dtype, op):
        config.cpu.vectorize.enable = True
        config.cpu.vectorize.assume_inner_stride_one = True

-    ast_reduction = get_reduction_assign_ast(dtype, op, config)
+    ast_reduction = get_reduction_assign_ast(dtype, op, dims, config)
    kernel_reduction = ast_reduction.compile()

-    array = get_cpu_array(op, dtype)
-    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+    array = get_cpu_array(dtype, op, dims)
+    reduction_array = np.full((1,), init_w, dtype=dtype)

    kernel_reduction(x=array, w=reduction_array)

-    assert np.allclose(reduction_array, SOLUTION[op])
+    assert np.allclose(reduction_array, get_expected_solution(op, array))

@pytest.mark.parametrize("dtype", ["float64", "float32"])
@@ -81,6 +107,7 @@ def test_reduction_cpu(target, dtype, op):
        ReductionOp.Max,
    ],
)
+@pytest.mark.parametrize("dims", [1, 2, 3])
@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
@pytest.mark.parametrize("use_block_fitting", [True, False])
@pytest.mark.parametrize("warp_size", [32, None])
@@ -90,6 +117,7 @@ def test_reduction_cpu(target, dtype, op):
def test_reduction_gpu(
    dtype: str,
    op: str,
+    dims: int,
    assume_warp_aligned_block_size: bool,
    use_block_fitting: bool,
    warp_size: int | None,
@@ -101,17 +129,17 @@ def test_reduction_gpu(
    if warp_size:
        cfg.gpu.warp_size = warp_size

-    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
+    ast_reduction = get_reduction_assign_ast(dtype, op, dims, cfg)
    kernel_reduction = ast_reduction.compile()

    if use_block_fitting and warp_size:
        kernel_reduction.launch_config.fit_block_size((warp_size, 1, 1))

-    array = get_cpu_array(op, dtype)
-    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+    array = get_cpu_array(dtype, op, dims)
+    reduction_array = np.full((1,), init_w, dtype=dtype)

    array_gpu = cp.asarray(array)
    reduction_array_gpu = cp.asarray(reduction_array)

    kernel_reduction(x=array_gpu, w=reduction_array_gpu)

-    assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
+    assert np.allclose(reduction_array_gpu.get(), get_expected_solution(op, array))
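A quick sanity check of the `init_arr` choice for multiplicative reductions under the new 3-D test sizes (16³ = 4096 elements): with the old constant 2.0, the product would overflow float64, while 0.99⁴⁰⁹⁶ ≈ 1.3e-18 stays representable:

```python
import numpy as np

n = 16 ** 3  # elements of the largest (3-D) test array
with np.errstate(over="ignore"):
    print(np.float64(2.0) ** n)   # inf: the old INIT_ARR = 2 would overflow
print(np.float64(0.99) ** n)      # ~1.3e-18, still representable
```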