Skip to content
Snippets Groups Projects
Commit 617a9282 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

Add test cases for various sliced iterations. Fix trailing iters in loop...

Add test cases for various sliced iterations. Fix trailing iters in loop vectorizer when simd loop is never entered. Extract default gen configs to fixtures.
parent f6769587
No related branches found
No related tags found
3 merge requests!433Consolidate codegen and JIT modules.,!430Jupyter Inspection Framework, Book Theme, and Initial Drafts for Codegen Reference Guides,!429Iteration Slices: Extended GPU support + bugfixes
Pipeline #70307 passed
......@@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py,
src/pystencils/plot.py
src/pystencils/session.py
src/pystencils/old
ignore = W293 W503 W291 C901 E741
ignore = W293 W503 W291 C901 E741 E704
......@@ -203,3 +203,8 @@ else:
return IPyNbFile.from_parent(fspath=path, parent=parent)
else:
return IPyNbFile(path, parent)
# Fixtures
from tests.fixtures import *
......@@ -144,7 +144,7 @@ class CudaPlatform(GenericGpu):
indexing_decls = []
conds = []
for i, dim in enumerate(dimensions):
for i, dim in enumerate(dimensions[::-1]):
dim.counter.dtype = constify(dim.counter.get_dtype())
ctr = PsExpression.make(dim.counter)
......@@ -161,6 +161,8 @@ class CudaPlatform(GenericGpu):
if not self._cfg.omit_range_check:
conds.append(PsLt(ctr, dim.stop))
indexing_decls = indexing_decls[::-1]
if conds:
condition: PsExpression = conds[0]
for cond in conds[1:]:
......@@ -214,72 +216,3 @@ class CudaPlatform(GenericGpu):
block_idx = BLOCK_IDX[coord]
thread_idx = THREAD_IDX[coord]
return block_idx * block_size + thread_idx
# class LinearIndexing:
# """Linear GPU thread indexing.
# This indexing scheme maps GPU threads to iteration space points in the following way:
# - Starting from the slowest coordinate, each coordinate is mapped to a dimension
# of the GPU grid until just one dimension is left
# - All remaining dimensions of the iteration space are linearly mapped
# onto the fastest launch grid dimension
# """
# def __init__(
# self,
# ctx: KernelCreationContext,
# launch_grid_dimensions: int,
# ispace: FullIterationSpace,
# ) -> None:
# if not (0 < launch_grid_dimensions <= 3):
# raise ValueError(
# f"Invalid number of launch grid dimensions: {launch_grid_dimensions}"
# )
# self._ctx = ctx
# self._grid_dims = launch_grid_dimensions
# self._ispace = ispace
# self._ispace_dims = len(ispace.dimensions)
# self._typify = Typifier(ctx)
# def get_counter_declarations(self) -> Sequence[PsDeclaration]:
# num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1
# num_fast_dimensions = self._ispace_dims - num_slower_dimensions
# decls = []
# # Slower n dimensions
# for i in range(num_slower_dimensions, 0, -1):
# thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i]
# decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx))
# # Fastest dimensions
# thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
# if num_fast_dimensions == 1:
# decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx))
# else:
# for i in range(num_fast_dimensions, 0, -1):
# decls.append(
# self._make_ctr_decl(
# self._ispace.dimensions[i],
# # ergh... need actual iterations here...
# )
# )
# def _make_ctr_decl(
# self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression
# ):
# dim.counter.dtype = constify(dim.counter.get_dtype())
# ctr = PsExpression.make(dim.counter)
# return self._typify(
# PsDeclaration(
# ctr,
# dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx),
# )
# )
......@@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext
from ..constants import PsConstant
from ..ast import PsAstNode
from ..ast.structural import PsLoop, PsBlock, PsDeclaration
from ..ast.expressions import PsExpression
from ..ast.expressions import PsExpression, PsTernary, PsGt
from ..ast.vector import PsVecBroadcast
from ..ast.analysis import collect_undefined_symbols
......@@ -18,7 +18,7 @@ from .rewrite import substitute_symbols
class LoopVectorizer:
"""Vectorize loops.
The loop vectorizer provides methods to vectorize single loops inside an AST
using a given number of vector lanes.
During vectorization, the loop body is transformed using the `AstVectorizer`,
......@@ -64,29 +64,26 @@ class LoopVectorizer:
@overload
def vectorize_select_loops(
self, node: PsBlock, predicate: Callable[[PsLoop], bool]
) -> PsBlock:
...
) -> PsBlock: ...
@overload
def vectorize_select_loops(
self, node: PsLoop, predicate: Callable[[PsLoop], bool]
) -> PsLoop | PsBlock:
...
) -> PsLoop | PsBlock: ...
@overload
def vectorize_select_loops(
self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
) -> PsAstNode:
...
) -> PsAstNode: ...
def vectorize_select_loops(
self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
) -> PsAstNode:
"""Select and vectorize loops from a syntax tree according to a predicate.
Finds each loop inside a subtree and evaluates ``predicate`` on them.
If ``predicate(loop)`` evaluates to `True`, the loop is vectorized.
Loops nested inside a vectorized loop will not be processed.
Args:
......@@ -139,7 +136,7 @@ class LoopVectorizer:
# Generate vectorized loop body
simd_body = self._vectorize_ast(loop.body, vc)
if vector_ctr in collect_undefined_symbols(simd_body):
simd_body.statements.insert(0, vector_counter_decl)
......@@ -186,20 +183,31 @@ class LoopVectorizer:
trailing_start = self._ctx.get_new_symbol(
f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype()
)
trailing_start_decl = self._type_fold(
PsDeclaration(
PsExpression.make(trailing_start),
(
PsTernary(
# If at least one vectorized iteration took place...
PsGt(
PsExpression.make(simd_stop),
simd_start.clone(),
),
# start from the smallest non-valid multiple of simd_step, offset from simd_start
(
PsExpression.make(simd_stop)
- simd_start.clone()
- PsExpression.make(PsConstant(1))
(
PsExpression.make(simd_stop)
- simd_start.clone()
- PsExpression.make(PsConstant(1))
)
/ PsExpression.make(simd_step)
+ PsExpression.make(PsConstant(1))
)
/ PsExpression.make(simd_step)
+ PsExpression.make(PsConstant(1))
)
* PsExpression.make(simd_step)
+ simd_start.clone(),
* PsExpression.make(simd_step)
+ simd_start.clone(),
# otherwise (no vectorized iteration took place) start at simd_start
simd_start.clone(),
),
)
)
......
"""Fixtures for the pystencils test suite
This module provides a number of fixtures used by the pystencils test suite.
Use these fixtures wherever applicable to extend the code surface area covered
by your tests:
- All tests that should work for every target should use the `target` fixture
- All tests that should work with the highest optimization level for every target
should use the `gen_config` fixture
- Use the `xp` fixture to access the correct array module (numpy or cupy) depending
on the target
"""
import pytest
from types import ModuleType
from dataclasses import replace
import pystencils as ps
# Targets the test suite is parametrized over; the generic CPU target is always included.
AVAILABLE_TARGETS = [ps.Target.GenericCPU]

try:
    import cupy
except ImportError:
    # cupy cannot be imported: leave the CUDA target out.
    pass
else:
    AVAILABLE_TARGETS.append(ps.Target.CUDA)

AVAILABLE_TARGETS.extend(ps.Target.available_vector_cpu_targets())

# Test IDs shown by pytest, in the same order as AVAILABLE_TARGETS.
TARGET_IDS = [tgt.name for tgt in AVAILABLE_TARGETS]
@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS)
def target(request) -> ps.Target:
    """Parametrized fixture: one entry of `AVAILABLE_TARGETS` per test instance."""
    selected_target = request.param
    return selected_target
@pytest.fixture
def gen_config(target: ps.Target):
    """Kernel creation config for the current ``target``.

    Targets for which ``target.is_vector_cpu()`` is false get a plain
    ``CreateKernelConfig(target=target)``.
    Vector-CPU targets additionally get a vectorization config with
    ``assume_inner_stride_one=True``.
    """
    base_config = ps.CreateKernelConfig(target=target)
    if not target.is_vector_cpu():
        return base_config

    vectorization = ps.VectorizationConfig(assume_inner_stride_one=True)
    return replace(
        base_config,
        cpu_optim=ps.CpuOptimConfig(vectorize=vectorization),
    )
@pytest.fixture()
def xp(target: ps.Target) -> ModuleType:
    """Array module matching the current target.

    Returns:
        The `cupy` module if `target == Target.CUDA`, the `numpy` module otherwise
    """
    if target != ps.Target.CUDA:
        import numpy

        return numpy

    # Imported lazily so that cupy is only required when the CUDA target is in use
    import cupy

    return cupy
......@@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil
from pystencils.kernelcreation import create_kernel, KernelFunction
from pystencils.backend.emission import emit_code
# Targets this test module is parametrized over; the generic CPU target is always included.
AVAILABLE_TARGETS = [Target.GenericCPU]

try:
    import cupy
except ImportError:
    # cupy cannot be imported: leave the CUDA target out.
    pass
else:
    AVAILABLE_TARGETS.append(Target.CUDA)

AVAILABLE_TARGETS.extend(Target.available_vector_cpu_targets())

# Test IDs shown by pytest, in the same order as AVAILABLE_TARGETS.
TEST_IDS = [tgt.name for tgt in AVAILABLE_TARGETS]
@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
def gen_config(request):
    """Kernel creation config for each entry of `AVAILABLE_TARGETS`.

    Targets containing the `Target._VECTOR` flag additionally get a
    vectorization config with ``assume_inner_stride_one=True``.
    """
    target: Target = request.param
    base_config = CreateKernelConfig(target=target)

    if Target._VECTOR not in target:
        return base_config

    vectorization = VectorizationConfig(assume_inner_stride_one=True)
    return replace(base_config, cpu_optim=CpuOptimConfig(vectorize=vectorization))
def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
code = emit_code(kernel)
......
import numpy as np
import sympy as sp
import pytest
from dataclasses import replace
from pystencils import (
DEFAULTS,
Assignment,
Field,
TypedSymbol,
create_kernel,
make_slice,
Target,
CreateKernelConfig,
GpuIndexingConfig,
DynamicType,
)
from pystencils.sympyextensions.integer_functions import int_rem
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import normalize_slice
from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper
def test_sliced_iteration():
    """Run a four-point stencil on the slice ``[1:x_end, 1]`` with a symbolic bound.

    ``src`` is all ones and ``a == b == 1``, so every cell the kernel visits
    must become 1 while every other cell of ``dst`` stays 0.
    """
    shape = (4, 4)
    src_arr = np.ones(shape)
    dst_arr = np.zeros_like(src_arr)

    src = Field.create_from_numpy_array("src", src_arr)
    dst = Field.create_from_numpy_array("dst", dst_arr)

    a, b = sp.symbols("a b")
    a_terms = a * src[0, 1] + a * src[0, -1]
    b_terms = b * src[1, 0] + b * src[-1, 0]
    stencil = Assignment(dst[0, 0], (a_terms + b_terms) / 4)

    # The upper bound of the first slice entry is only known at call time
    x_end = TypedSymbol("x_end", "int")
    iteration_slice = make_slice[1:x_end, 1]
    x_end_value = shape[1] - 1

    assignments = sympy_cse_on_assignment_list([stencil])
    kernel = create_kernel(assignments, iteration_slice=iteration_slice).compile()
    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)

    reference = np.zeros(shape)
    reference[1:x_end_value, 1] = 1
    np.testing.assert_almost_equal(reference, dst_arr)
@pytest.mark.parametrize(
    "islice",
    [
        make_slice[1:-1, 1:-1],
        make_slice[3, 2:-2],
        make_slice[2:-2:2, ::3],
        make_slice[10:, :-5:2],
    ],
)
def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice):
    """Set ``f`` to 1 on a constant iteration slice and compare with array slicing.

    Args:
        gen_config: Codegen configuration for the current target (fixture)
        xp: Array module for the current target (fixture)
        islice: Iteration slice under test; also used to index the expected array
    """
    shape = (16, 16)

    f_arr = xp.zeros(shape)
    expected = xp.zeros_like(f_arr)
    expected[islice] = 1.0

    f = Field.create_from_numpy_array("f", f_arr)
    update = Assignment(f.center(), 1)
    gen_config = replace(gen_config, iteration_slice=islice)

    try:
        kernel = create_kernel(update, gen_config).compile()
    except NotImplementedError:
        if gen_config.target.is_vector_cpu():
            # TODO Gather/Scatter not implemented yet
            pytest.xfail("Gather/Scatter not available yet")
        # On any other target the error is unexpected: propagate it instead of
        # swallowing it and failing below on an unbound `kernel`.
        raise

    kernel(f=f_arr)

    xp.testing.assert_array_equal(f_arr, expected)
def test_symbolic_slice(gen_config: CreateKernelConfig, xp):
    """Compile one kernel with symbolic slice bounds and run it on several regions.

    The iteration slice is ``[sy:ey, sx:ex]``; the four bounds are passed as
    kernel arguments, and the result is compared with plain array slicing.
    """
    shape = (16, 16)

    f_arr = xp.zeros(shape)
    f = Field.create_from_numpy_array("f", f_arr)
    update = Assignment(f.center(), 1)

    sx, sy, ex, ey = (
        TypedSymbol(name, DynamicType.INDEX_TYPE) for name in ("sx", "sy", "ex", "ey")
    )
    gen_config = replace(gen_config, iteration_slice=make_slice[sy:ey, sx:ex])
    kernel = create_kernel(update, gen_config).compile()

    regions = (make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11])
    for region in regions:
        normalized = normalize_slice(region, shape)
        y_range, x_range = normalized[0], normalized[1]

        expected = xp.zeros_like(f_arr)
        expected[normalized] = 1.0

        # Reset the array so results of the previous region do not leak into this one
        f_arr[:] = 0.0

        kernel(
            f=f_arr,
            sy=y_range.start,
            ey=y_range.stop,
            sx=x_range.start,
            ex=x_range.stop,
        )

        xp.testing.assert_array_equal(f_arr, expected)
def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
    """Iterate over ``[:, ctr_0:]``, where the inner start is the outer counter.

    The expected result is 1 at every ``(r, c)`` with ``c >= r`` and 0 elsewhere.
    """
    shape = (16, 16)

    f_arr = xp.zeros(shape)
    f = Field.create_from_numpy_array("f", f_arr)

    # Row r is filled from column r onwards: the upper triangle including the diagonal
    expected = xp.triu(xp.ones_like(f_arr))

    update = Assignment(f.center(), 1)

    row_counter = DEFAULTS.spatial_counters[0]
    gen_config = replace(gen_config, iteration_slice=make_slice[:, row_counter:])

    if gen_config.target == Target.CUDA:
        manual_grid = GpuIndexingConfig(manual_launch_grid=True)
        gen_config = replace(gen_config, gpu_indexing=manual_grid)

    kernel = create_kernel(update, gen_config).compile()

    if isinstance(kernel, CupyKernelWrapper):
        # Launch grid is set by hand: a single block spanning the whole array
        kernel.block_size = shape + (1,)
        kernel.num_blocks = (1, 1, 1)

    kernel(f=f_arr)

    xp.testing.assert_array_equal(f_arr, expected)
def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
    """Iterate over a checkerboard: even rows start at column 0, odd rows at column 1.

    The inner slice start is a piecewise expression of the outer counter and the
    inner step is 2; the result is compared with an array filled via strided slices.
    """
    shape = (16, 16)

    f_arr = xp.zeros(shape)
    f = Field.create_from_numpy_array("f", f_arr)

    expected = xp.zeros_like(f_arr)
    expected[0::2, 0::2] = 1.0  # even rows, starting at column 0
    expected[1::2, 1::2] = 1.0  # odd rows, starting at column 1

    update = Assignment(f.center(), 1)

    outer_counter = DEFAULTS.spatial_counters[0]
    row_start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True))
    gen_config = replace(gen_config, iteration_slice=make_slice[:, row_start::2])

    if gen_config.target == Target.CUDA:
        gen_config = replace(
            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
        )

    try:
        kernel = create_kernel(update, gen_config).compile()
    except NotImplementedError:
        if gen_config.target.is_vector_cpu():
            pytest.xfail("Gather/Scatter not implemented yet")
        # On any other target the error is unexpected: propagate it instead of
        # swallowing it and failing below on an unbound `kernel`.
        raise

    if isinstance(kernel, CupyKernelWrapper):
        # Launch grid is set by hand for the manual-launch-grid configuration
        kernel.block_size = (8, 16, 1)
        kernel.num_blocks = (1, 1, 1)

    kernel(f=f_arr)

    xp.testing.assert_array_equal(f_arr, expected)
import numpy as np
import sympy as sp
from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
from pystencils.simp import sympy_cse_on_assignment_list
def test_sliced_iteration():
    """Apply a four-point stencil on the slice ``[1:x_end, 1]`` with symbolic ``x_end``.

    With ``src`` all ones and ``a == b == 1``, each visited cell of ``dst``
    becomes 1; all remaining cells must stay 0.
    """
    grid_size = (4, 4)
    src_arr = np.ones(grid_size)
    dst_arr = np.zeros_like(src_arr)

    src_field = Field.create_from_numpy_array('src', src_arr)
    dst_field = Field.create_from_numpy_array('dst', dst_arr)

    a, b = sp.symbols("a b")
    neighbor_sum = (
        a * src_field[0, 1]
        + a * src_field[0, -1]
        + b * src_field[1, 0]
        + b * src_field[-1, 0]
    )
    update_rule = Assignment(dst_field[0, 0], neighbor_sum / 4)

    # Upper bound of the first slice entry is supplied when the kernel is called
    x_end = TypedSymbol("x_end", "int")
    x_end_value = grid_size[1] - 1

    simplified = sympy_cse_on_assignment_list([update_rule])
    kernel = create_kernel(simplified, iteration_slice=make_slice[1:x_end, 1]).compile()
    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)

    expected_result = np.zeros(grid_size)
    expected_result[1:x_end_value, 1] = 1
    np.testing.assert_almost_equal(expected_result, dst_arr)
......@@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list(
TEST_IDS = [t.name for t in TEST_SETUPS]
@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS)
def vectorization_setup(request) -> VectorTestSetup:
    """Parametrized fixture: one entry of `TEST_SETUPS` per test instance."""
    chosen_setup = request.param
    return chosen_setup
def create_vector_kernel(
assignments: list[Assignment],
field: Field,
......@@ -139,9 +144,9 @@ def create_vector_kernel(
return kernel
@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
@pytest.mark.parametrize("ghost_layers", [0, 2])
def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int):
setup = vectorization_setup
src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx")
x = sp.symbols("x_:4")
......@@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
np.testing.assert_equal(dst_arr[:, -i, :], 0.0)
@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
def test_trailing_iterations(setup: VectorTestSetup):
def test_trailing_iterations(vectorization_setup: VectorTestSetup):
setup = vectorization_setup
f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
update = [Assignment(f(0), 2 * f(0))]
......@@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup):
kernel(f=f_arr)
np.testing.assert_equal(f_arr, 2.0)
def test_only_trailing_iterations(vectorization_setup: VectorTestSetup):
    """Run the doubling kernel on arrays with fewer entries than vector lanes.

    For every length from 1 to ``setup.lanes - 1``, an array filled with 1.0
    must come back filled with 2.0.
    """
    setup = vectorization_setup
    f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")

    doubling = [Assignment(f(0), 2 * f(0))]
    kernel = create_vector_kernel(doubling, f, setup)

    for num_entries in range(1, setup.lanes):
        f_arr = create_numpy_array_with_layout(
            (num_entries, 1),
            layout=(1, 0),
            dtype=setup.numeric_dtype.numpy_dtype,
        )
        f_arr[:] = 1.0

        kernel(f=f_arr)

        np.testing.assert_equal(f_arr, 2.0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment