diff --git a/.flake8 b/.flake8 index 3f946922a5f68e39bc02c65d4a5f4ffdb10e05e1..05758405a65664809b3a0a71893554a1af8e3381 100644 --- a/.flake8 +++ b/.flake8 @@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py, src/pystencils/plot.py src/pystencils/session.py src/pystencils/old -ignore = W293 W503 W291 C901 E741 +ignore = W293 W503 W291 C901 E741 E704 diff --git a/conftest.py b/conftest.py index 0c6c49153770bc40c90cb09e990d12561f7bad7a..4e8e2b73ab49ff1f881e755b19f9a46e07c0837e 100644 --- a/conftest.py +++ b/conftest.py @@ -203,3 +203,8 @@ else: return IPyNbFile.from_parent(fspath=path, parent=parent) else: return IPyNbFile(path, parent) + + +# Fixtures + +from tests.fixtures import * diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 5c8b5b5045b5f889123cdf86686ecd696c6d8bf7..dbade47d1b8fafcb55c12be0b07d674f6edb3ce3 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -144,7 +144,7 @@ class CudaPlatform(GenericGpu): indexing_decls = [] conds = [] - for i, dim in enumerate(dimensions): + for i, dim in enumerate(dimensions[::-1]): dim.counter.dtype = constify(dim.counter.get_dtype()) ctr = PsExpression.make(dim.counter) @@ -161,6 +161,8 @@ class CudaPlatform(GenericGpu): if not self._cfg.omit_range_check: conds.append(PsLt(ctr, dim.stop)) + indexing_decls = indexing_decls[::-1] + if conds: condition: PsExpression = conds[0] for cond in conds[1:]: @@ -214,72 +216,3 @@ class CudaPlatform(GenericGpu): block_idx = BLOCK_IDX[coord] thread_idx = THREAD_IDX[coord] return block_idx * block_size + thread_idx - - -# class LinearIndexing: -# """Linear GPU thread indexing. - -# This indexing scheme maps GPU threads to iteration space points in the following way: -# - Starting from the slowest coordinate, each coordinate is mapped to a dimension -# of the GPU grid until just one dimension is left -# - All remaining dimensions of the iteration space are linearly mapped -# onto the fastest launch grid dimension -# """ - -# def __init__( -# self, -# ctx: KernelCreationContext, -# launch_grid_dimensions: int, -# ispace: FullIterationSpace, -# ) -> None: -# if not (0 < launch_grid_dimensions <= 3): -# raise ValueError( -# f"Invalid number of launch grid dimensions: {launch_grid_dimensions}" -# ) - -# self._ctx = ctx - -# self._grid_dims = launch_grid_dimensions -# self._ispace = ispace -# self._ispace_dims = len(ispace.dimensions) - -# self._typify = Typifier(ctx) - -# def get_counter_declarations(self) -> Sequence[PsDeclaration]: -# num_slower_dimensions = min(self._grid_dims, self._ispace_dims) - 1 -# num_fast_dimensions = self._ispace_dims - num_slower_dimensions - -# decls = [] - -# # Slower n dimensions -# for i in range(num_slower_dimensions, 0, -1): -# thread_idx = BLOCK_IDX[i] * BLOCK_DIM[i] + THREAD_IDX[i] -# decls.append(self._make_ctr_decl(self._ispace.dimensions[num_fast_dimensions + i], thread_idx)) - -# # Fastest dimensions -# thread_idx = BLOCK_IDX[0] * BLOCK_DIM[0] + THREAD_IDX[0] - -# if num_fast_dimensions == 1: -# decls.append(self._make_ctr_decl(self._ispace.dimensions[0], thread_idx)) -# else: -# for i in range(num_fast_dimensions, 0, -1): -# decls.append( -# self._make_ctr_decl( -# self._ispace.dimensions[i], -# # ergh... need actual iterations here... -# ) -# ) - - -# def _make_ctr_decl( -# self, dim: FullIterationSpace.Dimension, thread_idx: PsExpression -# ): -# dim.counter.dtype = constify(dim.counter.get_dtype()) - -# ctr = PsExpression.make(dim.counter) -# return self._typify( -# PsDeclaration( -# ctr, -# dim.start + dim.step * PsCast(ctr.get_dtype(), thread_idx), -# ) -# ) diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index c89698193d9764996cfab7a5c5fd7e70358fbac6..e1e4fea502c08de86e13de5e3c251f1b7a7d0ee6 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -8,7 +8,7 @@ from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression +from ..ast.expressions import PsExpression, PsTernary, PsGt from ..ast.vector import PsVecBroadcast from ..ast.analysis import collect_undefined_symbols @@ -18,7 +18,7 @@ from .rewrite import substitute_symbols class LoopVectorizer: """Vectorize loops. - + The loop vectorizer provides methods to vectorize single loops inside an AST using a given number of vector lanes. During vectorization, the loop body is transformed using the `AstVectorizer`, @@ -64,29 +64,26 @@ class LoopVectorizer: @overload def vectorize_select_loops( self, node: PsBlock, predicate: Callable[[PsLoop], bool] - ) -> PsBlock: - ... + ) -> PsBlock: ... @overload def vectorize_select_loops( self, node: PsLoop, predicate: Callable[[PsLoop], bool] - ) -> PsLoop | PsBlock: - ... + ) -> PsLoop | PsBlock: ... @overload def vectorize_select_loops( self, node: PsAstNode, predicate: Callable[[PsLoop], bool] - ) -> PsAstNode: - ... + ) -> PsAstNode: ... def vectorize_select_loops( self, node: PsAstNode, predicate: Callable[[PsLoop], bool] ) -> PsAstNode: """Select and vectorize loops from a syntax tree according to a predicate. - + Finds each loop inside a subtree and evaluates ``predicate`` on them. If ``predicate(loop)`` evaluates to `True`, the loop is vectorized. - + Loops nested inside a vectorized loop will not be processed. Args: @@ -139,7 +136,7 @@ class LoopVectorizer: # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) - + if vector_ctr in collect_undefined_symbols(simd_body): simd_body.statements.insert(0, vector_counter_decl) @@ -186,20 +183,31 @@ class LoopVectorizer: trailing_start = self._ctx.get_new_symbol( f"__{scalar_ctr.name}_trailing_start", scalar_ctr.get_dtype() ) + trailing_start_decl = self._type_fold( PsDeclaration( PsExpression.make(trailing_start), - ( + PsTernary( + # If at least one vectorized iteration took place... + PsGt( + PsExpression.make(simd_stop), + simd_start.clone(), + ), + # start from the smallest non-valid multiple of simd_step, offset from simd_start ( - PsExpression.make(simd_stop) - - simd_start.clone() - - PsExpression.make(PsConstant(1)) + ( + PsExpression.make(simd_stop) + - simd_start.clone() + - PsExpression.make(PsConstant(1)) + ) + / PsExpression.make(simd_step) + + PsExpression.make(PsConstant(1)) ) - / PsExpression.make(simd_step) - + PsExpression.make(PsConstant(1)) - ) - * PsExpression.make(simd_step) - + simd_start.clone(), + * PsExpression.make(simd_step) + + simd_start.clone(), + # otherwise start at zero + simd_start.clone(), + ), ) ) diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 0000000000000000000000000000000000000000..7c95216147d02b7943b0c218af6fbb5b5175559b --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,71 @@ +"""Fixtures for the pystencils test suite + +This module provides a number of fixtures used by the pystencils test suite. +Use these fixtures wherever applicable to extend the code surface area covered +by your tests: + +- All tests that should work for every target should use the `target` fixture +- All tests that should work with the highest optimization level for every target + should use the `gen_config` fixture +- Use the `xp` fixture to access the correct array module (numpy or cupy) depending + on the target + +""" + +import pytest + +from types import ModuleType +from dataclasses import replace + +import pystencils as ps + +AVAILABLE_TARGETS = [ps.Target.GenericCPU] + +try: + import cupy + + AVAILABLE_TARGETS += [ps.Target.CUDA] +except ImportError: + pass + +AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets() +TARGET_IDS = [t.name for t in AVAILABLE_TARGETS] + +@pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS) +def target(request) -> ps.Target: + """Provides all code generation targets available on the current hardware""" + return request.param + +@pytest.fixture +def gen_config(target: ps.Target): + """Default codegen configuration for the current target. + + For GPU targets, set default indexing options. + For vector-CPU targets, set default vectorization config. + """ + + gen_config = ps.CreateKernelConfig(target=target) + + if target.is_vector_cpu(): + gen_config = replace( + gen_config, + cpu_optim=ps.CpuOptimConfig( + vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) + ), + ) + + return gen_config + +@pytest.fixture() +def xp(target: ps.Target) -> ModuleType: + """Primary array module for the current target. + + Returns: + `cupy` if `target == Target.CUDA`, and `numpy` otherwise + """ + if target == ps.Target.CUDA: + import cupy as xp + return xp + else: + import numpy as np + return np diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py index b9cebb3548851af29406acb93a3a4ea14127e35e..d02bfd8e46e8fc8c8f19bcebffc0db52787ff1bd 100644 --- a/tests/kernelcreation/test_domain_kernels.py +++ b/tests/kernelcreation/test_domain_kernels.py @@ -18,35 +18,6 @@ from pystencils.assignment import assignment_from_stencil from pystencils.kernelcreation import create_kernel, KernelFunction from pystencils.backend.emission import emit_code -AVAILABLE_TARGETS = [Target.GenericCPU] - -try: - import cupy - - AVAILABLE_TARGETS += [Target.CUDA] -except ImportError: - pass - -AVAILABLE_TARGETS += Target.available_vector_cpu_targets() -TEST_IDS = [t.name for t in AVAILABLE_TARGETS] - - -@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS) -def gen_config(request): - target: Target = request.param - - gen_config = CreateKernelConfig(target=target) - - if Target._VECTOR in target: - gen_config = replace( - gen_config, - cpu_optim=CpuOptimConfig( - vectorize=VectorizationConfig(assume_inner_stride_one=True) - ), - ) - - return gen_config - def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig): code = emit_code(kernel) diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py new file mode 100644 index 0000000000000000000000000000000000000000..94ed0295441885cb4ffc87855181659383b1fde7 --- /dev/null +++ b/tests/kernelcreation/test_iteration_slices.py @@ -0,0 +1,190 @@ +import numpy as np +import sympy as sp +import pytest + +from dataclasses import replace + +from pystencils import ( + DEFAULTS, + Assignment, + Field, + TypedSymbol, + create_kernel, + make_slice, + Target, + CreateKernelConfig, + GpuIndexingConfig, + DynamicType, +) +from pystencils.sympyextensions.integer_functions import int_rem +from pystencils.simp import sympy_cse_on_assignment_list +from pystencils.slicing import normalize_slice +from pystencils.backend.jit.gpu_cupy import CupyKernelWrapper + + +def test_sliced_iteration(): + size = (4, 4) + src_arr = np.ones(size) + dst_arr = np.zeros_like(src_arr) + src_field = Field.create_from_numpy_array("src", src_arr) + dst_field = Field.create_from_numpy_array("dst", dst_arr) + + a, b = sp.symbols("a b") + update_rule = Assignment( + dst_field[0, 0], + ( + a * src_field[0, 1] + + a * src_field[0, -1] + + b * src_field[1, 0] + + b * src_field[-1, 0] + ) + / 4, + ) + + x_end = TypedSymbol("x_end", "int") + s = make_slice[1:x_end, 1] + x_end_value = size[1] - 1 + kernel = create_kernel( + sympy_cse_on_assignment_list([update_rule]), iteration_slice=s + ).compile() + + kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value) + + expected_result = np.zeros(size) + expected_result[1:x_end_value, 1] = 1 + np.testing.assert_almost_equal(expected_result, dst_arr) + + +@pytest.mark.parametrize( + "islice", + [ + make_slice[1:-1, 1:-1], + make_slice[3, 2:-2], + make_slice[2:-2:2, ::3], + make_slice[10:, :-5:2], + ], +) +def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice): + shape = (16, 16) + + f_arr = xp.zeros(shape) + expected = xp.zeros_like(f_arr) + expected[islice] = 1.0 + + f = Field.create_from_numpy_array("f", f_arr) + + update = Assignment(f.center(), 1) + gen_config = replace(gen_config, iteration_slice=islice) + + try: + kernel = create_kernel(update, gen_config).compile() + except NotImplementedError: + if gen_config.target.is_vector_cpu(): + # TODO Gather/Scatter not implemented yet + pytest.xfail("Gather/Scatter not available yet") + + kernel(f=f_arr) + + xp.testing.assert_array_equal(f_arr, expected) + + +def test_symbolic_slice(gen_config: CreateKernelConfig, xp): + shape = (16, 16) + + sx, sy, ex, ey = [ + TypedSymbol(n, DynamicType.INDEX_TYPE) for n in ("sx", "sy", "ex", "ey") + ] + + f_arr = xp.zeros(shape) + + f = Field.create_from_numpy_array("f", f_arr) + + update = Assignment(f.center(), 1) + islice = make_slice[sy:ey, sx:ex] + gen_config = replace(gen_config, iteration_slice=islice) + kernel = create_kernel(update, gen_config).compile() + + for slic in [make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11]]: + slic = normalize_slice(slic, shape) + expected = xp.zeros_like(f_arr) + expected[slic] = 1.0 + + f_arr[:] = 0.0 + + kernel( + f=f_arr, + sy=slic[0].start, + ey=slic[0].stop, + sx=slic[1].start, + ex=slic[1].stop, + ) + + xp.testing.assert_array_equal(f_arr, expected) + + +def test_triangle_pattern(gen_config: CreateKernelConfig, xp): + shape = (16, 16) + + f_arr = xp.zeros(shape) + f = Field.create_from_numpy_array("f", f_arr) + + expected = xp.zeros_like(f_arr) + for r in range(shape[0]): + expected[r, r:] = 1.0 + + update = Assignment(f.center(), 1) + outer_counter = DEFAULTS.spatial_counters[0] + islice = make_slice[:, outer_counter:] + gen_config = replace(gen_config, iteration_slice=islice) + + if gen_config.target == Target.CUDA: + gen_config = replace( + gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True) + ) + + kernel = create_kernel(update, gen_config).compile() + + if isinstance(kernel, CupyKernelWrapper): + kernel.block_size = shape + (1,) + kernel.num_blocks = (1, 1, 1) + + kernel(f=f_arr) + + xp.testing.assert_array_equal(f_arr, expected) + + +def test_red_black_pattern(gen_config: CreateKernelConfig, xp): + shape = (16, 16) + + f_arr = xp.zeros(shape) + f = Field.create_from_numpy_array("f", f_arr) + + expected = xp.zeros_like(f_arr) + for r in range(shape[0]): + start = 0 if r % 2 == 0 else 1 + expected[r, start::2] = 1.0 + + update = Assignment(f.center(), 1) + outer_counter = DEFAULTS.spatial_counters[0] + start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True)) + islice = make_slice[:, start::2] + gen_config = replace(gen_config, iteration_slice=islice) + + if gen_config.target == Target.CUDA: + gen_config = replace( + gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True) + ) + + try: + kernel = create_kernel(update, gen_config).compile() + except NotImplementedError: + if gen_config.target.is_vector_cpu(): + pytest.xfail("Gather/Scatter not implemented yet") + + if isinstance(kernel, CupyKernelWrapper): + kernel.block_size = (8, 16, 1) + kernel.num_blocks = (1, 1, 1) + + kernel(f=f_arr) + + xp.testing.assert_array_equal(f_arr, expected) diff --git a/tests/kernelcreation/test_sliced_iteration.py b/tests/kernelcreation/test_sliced_iteration.py deleted file mode 100644 index 5eff0a89d4d23386d00e7408a19ece93453d7d9d..0000000000000000000000000000000000000000 --- a/tests/kernelcreation/test_sliced_iteration.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np -import sympy as sp - -from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice -from pystencils.simp import sympy_cse_on_assignment_list - - -def test_sliced_iteration(): - size = (4, 4) - src_arr = np.ones(size) - dst_arr = np.zeros_like(src_arr) - src_field = Field.create_from_numpy_array('src', src_arr) - dst_field = Field.create_from_numpy_array('dst', dst_arr) - - a, b = sp.symbols("a b") - update_rule = Assignment(dst_field[0, 0], - (a * src_field[0, 1] + a * src_field[0, -1] + - b * src_field[1, 0] + b * src_field[-1, 0]) / 4) - - x_end = TypedSymbol("x_end", "int") - s = make_slice[1:x_end, 1] - x_end_value = size[1] - 1 - kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile() - - kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value) - - expected_result = np.zeros(size) - expected_result[1:x_end_value, 1] = 1 - np.testing.assert_almost_equal(expected_result, dst_arr) diff --git a/tests/nbackend/test_vectorization.py b/tests/nbackend/test_vectorization.py index aeb0ebaa0601537331d4daaee1bfcbfdcccb2e6a..55330c9ee8d5d675379418748aa085ab4ce3ae73 100644 --- a/tests/nbackend/test_vectorization.py +++ b/tests/nbackend/test_vectorization.py @@ -89,6 +89,11 @@ TEST_SETUPS: list[VectorTestSetup] = list( TEST_IDS = [t.name for t in TEST_SETUPS] +@pytest.fixture(params=TEST_SETUPS, ids=TEST_IDS) +def vectorization_setup(request) -> VectorTestSetup: + return request.param + + def create_vector_kernel( assignments: list[Assignment], field: Field, @@ -139,9 +144,9 @@ def create_vector_kernel( return kernel -@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS) @pytest.mark.parametrize("ghost_layers", [0, 2]) -def test_update_kernel(setup: VectorTestSetup, ghost_layers: int): +def test_update_kernel(vectorization_setup: VectorTestSetup, ghost_layers: int): + setup = vectorization_setup src, dst = fields(f"src(2), dst(4): {setup.numeric_dtype}[2D]", layout="fzyx") x = sp.symbols("x_:4") @@ -197,8 +202,8 @@ def test_update_kernel(setup: VectorTestSetup, ghost_layers: int): np.testing.assert_equal(dst_arr[:, -i, :], 0.0) -@pytest.mark.parametrize("setup", TEST_SETUPS, ids=TEST_IDS) -def test_trailing_iterations(setup: VectorTestSetup): +def test_trailing_iterations(vectorization_setup: VectorTestSetup): + setup = vectorization_setup f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx") update = [Assignment(f(0), 2 * f(0))] @@ -216,3 +221,24 @@ def test_trailing_iterations(setup: VectorTestSetup): kernel(f=f_arr) np.testing.assert_equal(f_arr, 2.0) + + +def test_only_trailing_iterations(vectorization_setup: VectorTestSetup): + setup = vectorization_setup + f = fields(f"f(1): {setup.numeric_dtype}[1D]", layout="fzyx") + + update = [Assignment(f(0), 2 * f(0))] + + kernel = create_vector_kernel(update, f, setup) + + for trailing_iters in range(1, setup.lanes): + shape = (trailing_iters, 1) + f_arr = create_numpy_array_with_layout( + shape, layout=(1, 0), dtype=setup.numeric_dtype.numpy_dtype + ) + + f_arr[:] = 1.0 + + kernel(f=f_arr) + + np.testing.assert_equal(f_arr, 2.0)