Add test cases for various sliced iterations. Fix trailing iters in loop...

Add test cases for various sliced iterations. Fix trailing iters in loop vectorizer when simd loop is never entered. Extract default gen configs to fixtures.

Add test cases for various sliced iterations. Fix trailing iters in loop...
617a9282 · Frederik Hennig · f6769587 · 617a9282 · 617a9282 · 617a9282
Commit 617a9282 authored 7 months ago by Frederik Hennig
--- a/.flake8
+++ b/.flake8
@@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py,
        src/pystencils/plot.py
        src/pystencils/session.py
        src/pystencils/old
-ignore = W293 W503 W291 C901 E741
+ignore = W293 W503 W291 C901 E741 E704
--- a/conftest.py
+++ b/conftest.py
@@ -203,3 +203,8 @@ else:
                return IPyNbFile.from_parent(fspath=path, parent=parent)
            else:
                return IPyNbFile(path, parent)
+
+
+#   Fixtures
+
+from tests.fixtures import *
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -144,7 +144,7 @@ class CudaPlatform(GenericGpu):

        indexing_decls = []
        conds = []
-        for i, dim in enumerate(dimensions):
+        for i, dim in enumerate(dimensions[::-1]):
            dim.counter.dtype = constify(dim.counter.get_dtype())

            ctr = PsExpression.make(dim.counter)
@@ -161,6 +161,8 @@ class CudaPlatform(GenericGpu):
            if not self._cfg.omit_range_check:
                conds.append(PsLt(ctr, dim.stop))

+        indexing_decls = indexing_decls[::-1]
+
        if conds:
            condition: PsExpression = conds[0]
            for cond in conds[1:]:
@@ -214,72 +216,3 @@ class CudaPlatform(GenericGpu):
        block_idx = BLOCK_IDX[coord]
        thread_idx = THREAD_IDX[coord]
        return block_idx * block_size + thread_idx
-
-
-# class LinearIndexing:
-#     """Linear GPU thread indexing.
-
-#     This indexing scheme maps GPU threads to iteration space points in the following way:
-#     - Starting from the slowest coordinate, each coordinate is mapped to a dimension
-#       of the GPU grid until just one dimension is left
-#     - All remaining dimensions of the iteration space are linearly mapped
-#       onto the fastest launch grid dimension
-#     """
-
-#     def __init__(
-#         self,
-#         ctx: KernelCreationContext,
-#         launch_grid_dimensions: int,
-#         ispace: FullIterationSpace,
-#     ) -> None:
-#         if not (0 < launch_grid_dimensions <= 3):
-#             raise ValueError(
-#                 f"Invalid number of launch grid dimensions: {launch_grid_dimensions}"
-#             )
-
-#         self._ctx = ctx
-
-#         self._grid_dims = launch_grid_dimensions
-#         self._ispace = ispace
-#         self._ispace_dims = len(ispace.dimensions)
-
-#         self._typify = Typifier(ctx)
-
-#     def get_counter_declarations(self) -> Sequence[PsDeclaration]:
-#         num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1
-#         num_fast_dimensions = self._ispace_dims - num_slower_dimensions
-
-#         decls = []
-
-#         #   Slower n dimensions
-#         for i in range(num_slower_dimensions, 0, -1):
-#             thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i]
-#             decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx))
-
-#         #   Fastest dimensions
-#         thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0]
-
-#         if num_fast_dimensions == 1:
-#             decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx))
-#         else:
-#             for i in range(num_fast_dimensions, 0, -1):
-#                 decls.append(
-#                     self._make_ctr_decl(
-#                         self._ispace.dimensions[i],
-#                         #   ergh... need actual iterations here...
-#                     )
-#                 )
-
-
-#     def _make_ctr_decl(
-#         self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression
-#     ):
-#         dim.counter.dtype = constify(dim.counter.get_dtype())
-
-#         ctr = PsExpression.make(dim.counter)
-#         return self._typify(
-#             PsDeclaration(
-#                 ctr,
-#                 dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx),
-#             )
-#         )
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
 from ..ast.structural import PsLoop, PsBlock, PsDeclaration
-from ..ast.expressions import PsExpression
+from ..ast.expressions import PsExpression, PsTernary, PsGt
 from ..ast.vector import PsVecBroadcast
 from ..ast.analysis import collect_undefined_symbols

@@ -18,7 +18,7 @@ from .rewrite import substitute_symbols

 class LoopVectorizer:
    """Vectorize loops.
-    
+
    The loop vectorizer provides methods to vectorize single loops inside an AST
    using a given number of vector lanes.
    During vectorization, the loop body is transformed using the `AstVectorizer`,
@@ -64,29 +64,26 @@ class LoopVectorizer:
    @overload
    def vectorize_select_loops(
        self, node: PsBlock, predicate: Callable[[PsLoop], bool]
-    ) -> PsBlock:
-        ...
+    ) -> PsBlock: ...

    @overload
    def vectorize_select_loops(
        self, node: PsLoop, predicate: Callable[[PsLoop], bool]
-    ) -> PsLoop | PsBlock:
-        ...
+    ) -> PsLoop | PsBlock: ...

    @overload
    def vectorize_select_loops(
        self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
-    ) -> PsAstNode:
-        ...
+    ) -> PsAstNode: ...

    def vectorize_select_loops(
        self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
    ) -> PsAstNode:
        """Select and vectorize loops from a syntax tree according to a predicate.
-        
+
        Finds each loop inside a subtree and evaluates ``predicate`` on them.
        If ``predicate(loop)`` evaluates to `True`, the loop is vectorized.
-        
+
        Loops nested inside a vectorized loop will not be processed.

        Args:
@@ -139,7 +136,7 @@ class LoopVectorizer:

        #   Generate vectorized loop body
        simd_body = self._vectorize_ast(loop.body, vc)
-        
+
        if vector_ctr in collect_undefined_symbols(simd_body):
            simd_body.statements.insert(0, vector_counter_decl)

@@ -186,20 +183,31 @@ class LoopVectorizer:
                trailing_start = self._ctx.get_new_symbol(
                    f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype()
                )
+
                trailing_start_decl = self._type_fold(
                    PsDeclaration(
                        PsExpression.make(trailing_start),
-                        (
+                        PsTernary(
+                            #   If at least one vectorized iteration took place...
+                            PsGt(
+                                PsExpression.make(simd_stop),
+                                simd_start.clone(),
+                            ),
+                            #   start at the first value simd_start + k * simd_step that is not below simd_stop
                            (
-                                PsExpression.make(simd_stop)
-                                - simd_start.clone()
-                                - PsExpression.make(PsConstant(1))
+                                (
+                                    PsExpression.make(simd_stop)
+                                    - simd_start.clone()
+                                    - PsExpression.make(PsConstant(1))
+                                )
+                                / PsExpression.make(simd_step)
+                                + PsExpression.make(PsConstant(1))
                            )
-                            / PsExpression.make(simd_step)
-                            + PsExpression.make(PsConstant(1))
-                        )
-                        * PsExpression.make(simd_step)
-                        + simd_start.clone(),
+                            * PsExpression.make(simd_step)
+                            + simd_start.clone(),
+                            #   otherwise no vectorized iteration ran, so start at simd_start
+                            simd_start.clone(),
+                        ),
                    )
                )


--- a/tests/fixtures.py
+++ b/tests/fixtures.py
+"""Fixtures for the pystencils test suite
+
+This module provides a number of fixtures used by the pystencils test suite.
+Use these fixtures wherever applicable to extend the code surface area covered
+by your tests:
+
+- All tests that should work for every target should use the `target` fixture
+- All tests that should work with the highest optimization level for every target
+  should use the `gen_config` fixture
+- Use the `xp` fixture to access the correct array module (numpy or cupy) depending
+  on the target
+
+"""
+
+import pytest
+
+from types import ModuleType
+from dataclasses import replace
+
+import pystencils as ps
+
+#   Targets the fixtures below are parametrized over; GenericCPU is always included
+AVAILABLE_TARGETS = [ps.Target.GenericCPU]
+
+try:
+    import cupy
+except ImportError:
+    #   cupy cannot be imported: leave the CUDA target out
+    pass
+else:
+    AVAILABLE_TARGETS.append(ps.Target.CUDA)
+
+#   Add whatever vector-CPU targets pystencils reports as available
+AVAILABLE_TARGETS.extend(ps.Target.available_vector_cpu_targets())
+TARGET_IDS = [tgt.name for tgt in AVAILABLE_TARGETS]
+
+@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS)
+def target(request) -> ps.Target:
+    """Parametrized fixture running the test once per entry of `AVAILABLE_TARGETS`"""
+    selected_target: ps.Target = request.param
+    return selected_target
+
+@pytest.fixture
+def gen_config(target: ps.Target) -> ps.CreateKernelConfig:
+    """Default codegen configuration for the current target.
+
+    For vector-CPU targets, a `VectorizationConfig` with
+    ``assume_inner_stride_one=True`` is set on the CPU optimization options.
+    All other targets (including GPU targets) receive a `CreateKernelConfig`
+    that only has its target set; no GPU indexing options are changed here.
+
+    Args:
+        target: Code generation target provided by the `target` fixture
+
+    Returns:
+        The kernel creation configuration for ``target``
+    """
+    #   Use a distinct local name to avoid shadowing the fixture function
+    base_config = ps.CreateKernelConfig(target=target)
+
+    if not target.is_vector_cpu():
+        return base_config
+
+    return replace(
+        base_config,
+        cpu_optim=ps.CpuOptimConfig(
+            vectorize=ps.VectorizationConfig(assume_inner_stride_one=True)
+        ),
+    )
+
+@pytest.fixture()
+def xp(target: ps.Target) -> ModuleType:
+    """Array module matching the current target.
+
+    Returns:
+        The `cupy` module if `target == Target.CUDA`, the `numpy` module otherwise
+    """
+    if target != ps.Target.CUDA:
+        import numpy
+
+        return numpy
+
+    #   Imported lazily so that the module only needs cupy for the CUDA target
+    import cupy
+
+    return cupy
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil
 from pystencils.kernelcreation import create_kernel, KernelFunction
 from pystencils.backend.emission import emit_code

-AVAILABLE_TARGETS = [Target.GenericCPU]
-
-try:
-    import cupy
-
-    AVAILABLE_TARGETS += [Target.CUDA]
-except ImportError:
-    pass
-
-AVAILABLE_TARGETS += Target.available_vector_cpu_targets()
-TEST_IDS = [t.name for t in AVAILABLE_TARGETS]
-
-
-@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
-def gen_config(request):
-    target: Target = request.param
-
-    gen_config = CreateKernelConfig(target=target)
-
-    if Target._VECTOR in target:
-        gen_config = replace(
-            gen_config,
-            cpu_optim=CpuOptimConfig(
-                vectorize=VectorizationConfig(assume_inner_stride_one=True)
-            ),
-        )
-
-    return gen_config
-

 def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
    code = emit_code(kernel)

--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
+import numpy as np
+import sympy as sp
+import pytest
+
+from dataclasses import replace
+
+from pystencils import (
+    DEFAULTS,
+    Assignment,
+    Field,
+    TypedSymbol,
+    create_kernel,
+    make_slice,
+    Target,
+    CreateKernelConfig,
+    GpuIndexingConfig,
+    DynamicType,
+)
+from pystencils.sympyextensions.integer_functions import int_rem
+from pystencils.simp import sympy_cse_on_assignment_list
+from pystencils.slicing import normalize_slice
+from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper
+
+
+def test_sliced_iteration():
+    """Apply a four-neighbour stencil on a slice whose upper bound is a runtime symbol."""
+    shape = (4, 4)
+    src_arr = np.ones(shape)
+    dst_arr = np.zeros_like(src_arr)
+    src = Field.create_from_numpy_array("src", src_arr)
+    dst = Field.create_from_numpy_array("dst", dst_arr)
+
+    a, b = sp.symbols("a b")
+    weighted_neighbours = (
+        a * src[0, 1] + a * src[0, -1] + b * src[1, 0] + b * src[-1, 0]
+    )
+    stencil_update = Assignment(dst[0, 0], weighted_neighbours / 4)
+
+    #   The end of the first slice coordinate is passed in when calling the kernel
+    x_end = TypedSymbol("x_end", "int")
+    x_end_value = shape[1] - 1
+    iteration_slice = make_slice[1:x_end, 1]
+
+    assignments = sympy_cse_on_assignment_list([stencil_update])
+    kernel = create_kernel(assignments, iteration_slice=iteration_slice).compile()
+
+    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
+
+    #   With a = b = 1 and an all-ones source, every visited cell becomes 1
+    reference = np.zeros(shape)
+    reference[1:x_end_value, 1] = 1
+    np.testing.assert_almost_equal(reference, dst_arr)
+
+
+@pytest.mark.parametrize(
+    "islice",
+    [
+        make_slice[1:-1, 1:-1],
+        make_slice[3, 2:-2],
+        make_slice[2:-2:2, ::3],
+        make_slice[10:, :-5:2],
+    ],
+)
+def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice):
+    """Check that a kernel restricted to a numeric slice writes exactly that slice.
+
+    The expected result is obtained by applying the same slice to an array
+    of the current array module ``xp``.
+    """
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    expected = xp.zeros_like(f_arr)
+    expected[islice] = 1.0
+
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if gen_config.target.is_vector_cpu():
+            #   TODO Gather/Scatter not implemented yet
+            pytest.xfail("Gather/Scatter not available yet")
+        #   On any other target this is a genuine failure: re-raise instead of
+        #   continuing with an unbound `kernel` and failing with a NameError
+        raise
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_symbolic_slice(gen_config: CreateKernelConfig, xp):
+    """Compile one kernel with symbolic slice bounds and run it for several concrete slices."""
+    shape = (16, 16)
+
+    sx, sy, ex, ey = (
+        TypedSymbol(name, DynamicType.INDEX_TYPE) for name in ("sx", "sy", "ex", "ey")
+    )
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    update = Assignment(f.center(), 1)
+    gen_config = replace(gen_config, iteration_slice=make_slice[sy:ey, sx:ex])
+    kernel = create_kernel(update, gen_config).compile()
+
+    concrete_slices = (
+        make_slice[:, :],
+        make_slice[1:-1, 2:-2],
+        make_slice[8:14, 7:11],
+    )
+
+    for raw_slice in concrete_slices:
+        bounds = normalize_slice(raw_slice, shape)
+        first, second = bounds[0], bounds[1]
+
+        expected = xp.zeros_like(f_arr)
+        expected[bounds] = 1.0
+
+        #   The same array is reused for every slice, so clear it first
+        f_arr[:] = 0.0
+
+        kernel(
+            f=f_arr,
+            sy=first.start,
+            ey=first.stop,
+            sx=second.start,
+            ex=second.stop,
+        )
+
+        xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
+    """Iterate a slice whose second coordinate starts at the first spatial counter."""
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    #   Row `row` is expected to be filled from column `row` onwards
+    expected = xp.zeros_like(f_arr)
+    for row in range(shape[0]):
+        expected[row, row:] = 1.0
+
+    update = Assignment(f.center(), 1)
+    first_counter = DEFAULTS.spatial_counters[0]
+    gen_config = replace(gen_config, iteration_slice=make_slice[:, first_counter:])
+
+    on_cuda = gen_config.target == Target.CUDA
+    if on_cuda:
+        manual_indexing = GpuIndexingConfig(manual_launch_grid=True)
+        gen_config = replace(gen_config, gpu_indexing=manual_indexing)
+
+    kernel = create_kernel(update, gen_config).compile()
+
+    if isinstance(kernel, CupyKernelWrapper):
+        #   Launch grid is set by hand: a single block spanning the whole array
+        kernel.block_size = (*shape, 1)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
+
+
+def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
+    """Iterate a checkerboard slice whose start depends on the parity of the first counter.
+
+    In rows with an even index every second cell starting at column 0 is set,
+    in rows with an odd index every second cell starting at column 1.
+    """
+    shape = (16, 16)
+
+    f_arr = xp.zeros(shape)
+    f = Field.create_from_numpy_array("f", f_arr)
+
+    expected = xp.zeros_like(f_arr)
+    for r in range(shape[0]):
+        expected[r, (r % 2)::2] = 1.0
+
+    update = Assignment(f.center(), 1)
+    outer_counter = DEFAULTS.spatial_counters[0]
+    #   Symbolic counterpart of `r % 2` above
+    inner_start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True))
+    islice = make_slice[:, inner_start::2]
+    gen_config = replace(gen_config, iteration_slice=islice)
+
+    if gen_config.target == Target.CUDA:
+        gen_config = replace(
+            gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True)
+        )
+
+    try:
+        kernel = create_kernel(update, gen_config).compile()
+    except NotImplementedError:
+        if gen_config.target.is_vector_cpu():
+            pytest.xfail("Gather/Scatter not implemented yet")
+        #   On any other target this is a genuine failure: re-raise instead of
+        #   continuing with an unbound `kernel` and failing with a NameError
+        raise
+
+    if isinstance(kernel, CupyKernelWrapper):
+        #   Launch grid is set by hand: a single block
+        kernel.block_size = (8, 16, 1)
+        kernel.num_blocks = (1, 1, 1)
+
+    kernel(f=f_arr)
+
+    xp.testing.assert_array_equal(f_arr, expected)
--- a/tests/kernelcreation/test_sliced_iteration.py
+++ b/tests/kernelcreation/test_sliced_iteration.py
-import numpy as np
-import sympy as sp
-
-from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
-from pystencils.simp import sympy_cse_on_assignment_list
-
-
-def test_sliced_iteration():
-    size = (4, 4)
-    src_arr = np.ones(size)
-    dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
-
-    a, b = sp.symbols("a b")
-    update_rule = Assignment(dst_field[0, 0],
-                             (a * src_field[0, 1] + a * src_field[0, -1] +
-                              b * src_field[1, 0] + b * src_field[-1, 0]) / 4)
-
-    x_end = TypedSymbol("x_end", "int")
-    s = make_slice[1:x_end, 1]
-    x_end_value = size[1] - 1
-    kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile()
-
-    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
-
-    expected_result = np.zeros(size)
-    expected_result[1:x_end_value, 1] = 1
-    np.testing.assert_almost_equal(expected_result, dst_arr)
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list(
 TEST_IDS = [t.name for t in TEST_SETUPS]


+@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS)
+def vectorization_setup(request) -> VectorTestSetup:
+    return request.param  # the entry of TEST_SETUPS selected for this parametrized run
+
+
 def create_vector_kernel(
    assignments: list[Assignment],
    field: Field,
@@ -139,9 +144,9 @@ def create_vector_kernel(
    return kernel


-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
 @pytest.mark.parametrize("ghost_layers", [0, 2])
-def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
+def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int):
+    setup = vectorization_setup
    src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx")

    x = sp.symbols("x_:4")
@@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int):
            np.testing.assert_equal(dst_arr[:, -i, :], 0.0)


-@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS)
-def test_trailing_iterations(setup: VectorTestSetup):
+def test_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
    f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")

    update = [Assignment(f(0), 2 * f(0))]
@@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup):
        kernel(f=f_arr)

        np.testing.assert_equal(f_arr, 2.0)
+
+
+def test_only_trailing_iterations(vectorization_setup: VectorTestSetup):
+    setup = vectorization_setup
+    f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx")
+
+    update = [Assignment(f(0), 2 * f(0))]
+
+    kernel = create_vector_kernel(update, f, setup)
+    #   Every tested extent is smaller than the number of vector lanes
+    for trailing_iters in range(1, setup.lanes):
+        shape = (trailing_iters, 1)
+        f_arr = create_numpy_array_with_layout(
+            shape, layout=(1, 0), dtype=setup.numeric_dtype.numpy_dtype
+        )
+
+        f_arr[:] = 1.0
+
+        kernel(f=f_arr)
+        #   Each entry must have been doubled exactly once
+        np.testing.assert_equal(f_arr, 2.0)