Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
/*
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
inline void cachelineZero(void * p) {
#ifdef __riscv_zicboz
    __asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
...
@@ -38,6 +38,7 @@ class KernelConstraintsCheck:
     def __init__(self, check_independence_condition=True, check_double_write_condition=True):
         self.scopes = NestedScopes()
+        self.field_reads = defaultdict(set)
         self.field_writes = defaultdict(set)
         self.fields_read = set()
         self.check_independence_condition = check_independence_condition
@@ -111,6 +112,13 @@ class KernelConstraintsCheck:
             if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
                 raise ValueError(
                     f"Field {lhs.field.name} is written at two different locations")
+            if fai in self.field_reads:
+                reads = tuple(self.field_reads[fai])
+                if len(reads) > 1 or lhs.offsets != reads[0]:
+                    if self.check_independence_condition:
+                        raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
+                                         f"This means the resulting kernel would not be thread safe")
         elif isinstance(lhs, sp.Symbol):
             if self.scopes.is_defined_locally(lhs):
                 raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
@@ -120,8 +128,9 @@ class KernelConstraintsCheck:
     def update_accesses_rhs(self, rhs):
         if isinstance(rhs, Field.Access) and self.check_independence_condition:
-            writes = self.field_writes[self.FieldAndIndex(
-                rhs.field, rhs.index)]
+            fai = self.FieldAndIndex(rhs.field, rhs.index)
+            writes = self.field_writes[fai]
+            self.field_reads[fai].add(rhs.offsets)
             for write_offset in writes:
                 assert len(writes) == 1
                 if write_offset != rhs.offsets:
...
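For context, a minimal sketch (not part of this diff) of the situation the new field_reads bookkeeping is meant to catch: a kernel that writes a field at the centre while reading a neighbouring cell of the same field depends on the traversal order. With the default configuration (independence check enabled), create_kernel is expected to reject such an assignment; the field name and the exact behaviour shown below are illustrative.

import pytest
import pystencils as ps

f = ps.fields("f: double[2D]")
# write at the centre, read the right neighbour of the same field: the result
# depends on the order in which cells are updated, so the constraint check
# should raise ValueError ("... would not be thread safe")
update = ps.Assignment(f[0, 0], f[1, 0] + 1.0)
with pytest.raises(ValueError):
    ps.create_kernel(update)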
@@ -128,6 +128,7 @@ def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelCon
     # --- check constrains
     check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
                                    check_double_write_condition=not config.allow_double_writes)
     check.visit(assignments)
     assignments.bound_fields = check.fields_written
...
@@ -11,6 +11,7 @@ from sympy.core.relational import Relational
 from sympy.functions.elementary.piecewise import ExprCondPair
 from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
 from sympy.functions.elementary.hyperbolic import HyperbolicFunction
+from sympy.functions.elementary.integers import RoundFunction
 from sympy.logic.boolalg import BooleanFunction
 from sympy.logic.boolalg import BooleanAtom
@@ -213,7 +214,7 @@ class TypeAdder:
                 new_args.append(a)
             return expr.func(*new_args) if new_args else expr, collated_type
         elif isinstance(expr, (sp.Pow, sp.exp, InverseTrigonometricFunction, TrigonometricFunction,
-                               HyperbolicFunction, sp.log)):
+                               HyperbolicFunction, sp.log, RoundFunction)):
             args_types = [self.figure_out_type(arg) for arg in expr.args]
             collated_type = collate_types([t for _, t in args_types])
             new_args = [a if t.dtype_eq(collated_type) else CastFunc(a, collated_type) for a, t in args_types]
...
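As an aside, a small sketch (not part of this diff) of what handling RoundFunction in the TypeAdder enables: sp.floor and sp.ceiling are SymPy RoundFunction subclasses, so their arguments should now be collated to the surrounding field type like the other unary math functions. Field names below are illustrative.

import sympy as sp
import pystencils as ps

f, g = ps.fields("f, g: float32[2D]")
# floor/ceiling arguments get the collated float32 type instead of tripping
# up the type adder (exercised more thoroughly by the test changes below)
asm = ps.Assignment(g[0, 0], sp.floor(f[0, 0]) + sp.ceiling(f[0, 0]))
ast = ps.create_kernel(asm)
code = ps.get_code_str(ast)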
@@ -7,7 +7,7 @@ import sympy as sp
 def is_supported_type(dtype: np.dtype):
     scalar = dtype.type
-    c = np.issctype(dtype)
+    c = np.issubdtype(dtype, np.generic)
     subclass = issubclass(scalar, np.floating) or issubclass(scalar, np.integer) or issubclass(scalar, np.bool_)
     additional_checks = dtype.fields is None and dtype.hasobject is False and dtype.subdtype is None
     return c and subclass and additional_checks
...
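For background (not part of the diff): np.issctype is deprecated in recent NumPy releases and removed in NumPy 2.0, and np.issubdtype(dtype, np.generic) serves as the replacement test for plain scalar dtypes here; structured and object dtypes are still filtered out by the additional_checks line. A quick illustrative check:

import numpy as np

# plain scalar dtypes are subtypes of np.generic ...
assert np.issubdtype(np.dtype('float64'), np.generic)
assert np.issubdtype(np.dtype('int32'), np.generic)
# ... while structured dtypes are caught by the fields/hasobject/subdtype checks
assert np.dtype([('x', 'f8')]).fields is not None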
@@ -82,8 +82,8 @@ def boolean_array_bounding_box(boolean_array):
     >>> a = np.zeros((4, 4), dtype=bool)
     >>> a[1:-1, 1:-1] = True
-    >>> boolean_array_bounding_box(a)
-    [(1, 3), (1, 3)]
+    >>> boolean_array_bounding_box(a) == [(1, 3), (1, 3)]
+    True
     """
     dim = boolean_array.ndim
     shape = boolean_array.shape
...
@@ -3,6 +3,7 @@ import sympy as sp
 import pytest
 import pystencils as ps
+from pystencils.alignedarray import aligned_zeros
 from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.enums import Target
@@ -15,7 +16,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_any(instruction_set, dtype):
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         width = 4  # we don't know the actual value
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -34,7 +35,7 @@ def test_vec_any(instruction_set, dtype):
                                 cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         # we only know that the first value has changed
         np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0)
     else:
@@ -44,7 +45,7 @@ def test_vec_any(instruction_set, dtype):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_all(instruction_set, dtype):
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         width = 1000  # we don't know the actual value, need something guaranteed larger than vector
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -59,7 +60,7 @@ def test_vec_all(instruction_set, dtype):
                                 cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         # we only know that some values in the middle have been replaced
         assert np.all(data_arr[3:9, :2] <= 1.0)
         assert np.any(data_arr[3:9, 2:] == 2.0)
@@ -94,16 +95,60 @@ def test_boolean_before_loop():
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
-def test_vec_maskstore(instruction_set, dtype):
-    data_arr = np.zeros((16, 16), dtype=dtype)
+@pytest.mark.parametrize('nontemporal', [False, True])
+@pytest.mark.parametrize('aligned', [False, True])
+def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned):
+    data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype)
     data_arr[3:-3, 3:-3] = 1.0
     data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
     c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
     assignmets = NodeCollection(c)
-    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal,
+                                                       'assume_aligned': aligned},
+                                   default_number_float=dtype)
     ast = ps.create_kernel(assignmets, config=config)
+    if 'maskStore' in ast.instruction_set:
+        instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else (
+            'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore')
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_vec_maskscatter(instruction_set, dtype, nontemporal):
+    data_arr = np.zeros((16, 16), dtype=dtype)
+    data_arr[3:-3, 3:-3] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]")
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+    assignmets = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal},
+                                   default_number_float=dtype)
+    if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \
+            and not instruction_set.startswith('sve'):
+        with pytest.warns(UserWarning) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert 'Could not vectorize loop' in warn[0].message.args[0]
+    else:
+        with pytest.warns(None) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert len(warn) == 0
+        instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
     print(ps.get_code_str(ast))
     kernel = ast.compile()
     kernel(data=data_arr)
...
 import pytest
 import numpy as np
-import cupy as cp
 import sympy as sp
+import math
 from scipy.ndimage import convolve
-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target, get_code_str
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice

 try:
-    import cupy
-    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+    import cupy as cp
+    device_numbers = range(cp.cuda.runtime.getDeviceCount())
 except ImportError:
     device_numbers = []
+    cp = None


 def test_averaging_kernel():
+    pytest.importorskip('cupy')
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -44,6 +46,7 @@ def test_averaging_kernel():
 def test_variable_sized_fields():
+    pytest.importorskip('cupy')
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
@@ -71,6 +74,7 @@ def test_variable_sized_fields():
 def test_multiple_index_dimensions():
+    pytest.importorskip('cupy')
     """Sums along the last axis of a numpy array"""
     src_size = (7, 6, 4)
     dst_size = src_size[:2]
@@ -103,6 +107,7 @@ def test_multiple_index_dimensions():
 def test_ghost_layer():
+    pytest.importorskip('cupy')
     size = (6, 5)
     src_arr = np.ones(size)
     dst_arr = np.zeros_like(src_arr)
@@ -127,6 +132,7 @@ def test_ghost_layer():
 def test_setting_value():
+    pytest.importorskip('cupy')
     arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
     arr_gpu = cp.asarray(arr_cpu)
@@ -143,6 +149,7 @@ def test_setting_value():
 def test_periodicity():
+    pytest.importorskip('cupy')
     from pystencils.gpu.periodicity import get_periodic_boundary_functor as periodic_gpu
     from pystencils.slicing import get_periodic_boundary_functor as periodic_cpu
@@ -163,6 +170,7 @@ def test_periodicity():
 @pytest.mark.parametrize("device_number", device_numbers)
 def test_block_indexing(device_number):
+    pytest.importorskip('cupy')
     f = fields("f: [3D]")
     s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
     bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
@@ -195,6 +203,7 @@ def test_block_indexing(device_number):
 @pytest.mark.parametrize('layout', ("C", "F"))
 @pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
+    pytest.importorskip('cupy')
     n_elements = np.prod(shape)
     arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout)
@@ -210,3 +219,39 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     kernel(f=arr_gpu, value=np.float64(42.0))
     np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)
+
+
+@pytest.mark.parametrize('start', (1, 5))
+@pytest.mark.parametrize('end', (-1, -2, -3, -4))
+@pytest.mark.parametrize('step', (1, 2, 3, 4))
+@pytest.mark.parametrize('shape', ([55, 60], [77, 101, 80], [44, 64, 66]))
+def test_guards_with_iteration_slices(start, end, step, shape):
+    iter_slice = tuple([slice(start, end, step)] * len(shape))
+    kernel_config_gpu = CreateKernelConfig(target=Target.GPU, iteration_slice=iter_slice)
+    field_1 = fields(f"f(1) : double{list(shape)}")
+    assignment = Assignment(field_1.center, 1)
+
+    ast = create_kernel(assignment, config=kernel_config_gpu)
+    code_str = get_code_str(ast)
+
+    test_strings = list()
+    iteration_ranges = list()
+    for i, s in enumerate(iter_slice):
+        e = ((shape[i] + end) - s.start) / s.step
+        e = math.ceil(e) + s.start
+        test_strings.append(f"{s.start} < {e}")
+
+        a = s.start
+        counter = 0
+        while a < e:
+            a += 1
+            counter += 1
+        iteration_ranges.append(counter)
+
+    # check if the expected if statement is in the GPU code
+    for s in test_strings:
+        assert s in code_str
+
+    # check if these bounds lead to same lengths as the range function would produce
+    for i in range(len(iter_slice)):
+        assert iteration_ranges[i] == len(range(iter_slice[i].start, shape[i] + end, iter_slice[i].step))
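A worked instance of the guard arithmetic in the test above (illustrative only): for start=1, end=-1, step=4 and a shape entry of 55, the bound is e = ceil((55 - 1 - 1) / 4) + 1 = 15, so the expected guard string is "1 < 15" and the counting loop runs 14 times, matching len(range(1, 54, 4)).

import math

# same computation as in the test, for one concrete parameter combination
start, end, step, extent = 1, -1, 4, 55
e = math.ceil(((extent + end) - start) / step) + start
assert e == 15
assert len(range(start, extent + end, step)) == 14 == e - start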
@@ -39,7 +39,7 @@ def test_two_arguments(dtype, func, target):
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
-@pytest.mark.parametrize('func', [sp.sin, sp.cos, sp.sinh, sp.cosh, sp.atan])
+@pytest.mark.parametrize('func', [sp.sin, sp.cos, sp.sinh, sp.cosh, sp.atan, sp.floor, sp.ceiling])
 @pytest.mark.parametrize('target', [ps.Target.CPU, ps.Target.GPU])
 def test_single_arguments(dtype, func, target):
     if target == ps.Target.GPU:
@@ -58,7 +58,8 @@ def test_single_arguments(dtype, func, target):
     ast = ps.create_kernel(up, config=config)
     code = ps.get_code_str(ast)
     if dtype == 'float32':
-        assert func.__name__.lower() in code
+        func_name = func.__name__.lower() if func is not sp.ceiling else "ceil"
+        assert func_name in code
     kernel = ast.compile()
     dh.all_to_gpu()
...
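A note on the ceil special case above (explanatory aside, not part of the change): SymPy names the function ceiling, while the generated C code calls the standard-library ceil/ceilf, so searching the code string for the lower-cased SymPy name would fail for this one function; mapping it to "ceil" keeps the substring check meaningful.

import sympy as sp

# only ceiling differs between the SymPy name and the C math function name
assert sp.ceiling.__name__.lower() == 'ceiling'   # but the generated C code uses ceil/ceilf
assert sp.floor.__name__.lower() == 'floor'       # matches the C name directly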
@@ -32,7 +32,7 @@ if get_compiler_config()['os'] == 'windows':
 def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
     if target == Target.GPU:
         pytest.importorskip('cupy')
-    if instruction_sets and {'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
+    if instruction_sets and {'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
         pytest.xfail('AES not yet implemented for this architecture')
     if rng == 'aesni' and len(keys) == 2:
         keys *= 2
...
@@ -143,10 +143,10 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set):
     # Without the base pointer spec, the inner store is not aligned
     config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
     ast = ps.create_kernel(update_rule, config=config)
-    if instruction_set in ['sse'] or instruction_set.startswith('avx'):
+    if instruction_set in ['sse'] or instruction_set.startswith('avx') or instruction_set.startswith('sve'):
         assert 'stream' in ast.instruction_set
         assert 'streamFence' in ast.instruction_set
-    if instruction_set in ['neon', 'sme', 'vsx', 'rvv'] or instruction_set.startswith('sve'):
+    if instruction_set in ['neon', 'vsx', 'rvv']:
         assert 'cachelineZero' in ast.instruction_set
     if instruction_set in ['vsx']:
         assert 'storeAAndFlushCacheline' in ast.instruction_set
@@ -331,7 +331,7 @@ def test_logical_operators(instruction_set=instruction_set):
 def test_hardware_query():
-    assert {'sse', 'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)
+    assert {'sse', 'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)

 def test_vectorised_pow(instruction_set=instruction_set):
...
@@ -60,22 +60,27 @@ def test_vectorized_abs(instruction_set, dtype):
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
-def test_strided(instruction_set, dtype):
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_strided(instruction_set, dtype, nontemporal):
     f, g = ps.fields(f"f, g : {dtype}[2D]")
     update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
+    config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                                      'nontemporal': nontemporal},
+                                                  default_number_float=dtype)
     if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \
             and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'):
         with pytest.warns(UserWarning) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
-                                                          default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert 'Could not vectorize loop' in warn[0].message.args[0]
     else:
         with pytest.warns(None) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
-                                                          default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert len(warn) == 0
+        instruction = 'streamS' if nontemporal and 'streamS' in ast.instruction_set else 'storeS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+        instruction = 'cachelineZero'
+        if instruction in ast.instruction_set:
+            assert ast.instruction_set[instruction] not in ps.get_code_str(ast)

     # ps.show_code(ast)
     func = ast.compile()
@@ -226,7 +231,7 @@ def test_issue62(dtype, instruction_set, padding):
     dy = sp.Symbol("dy")
     src, dst, rhs = ps.fields(f"src, src_tmp, rhs: {dtype}[2D]", layout='fzyx')
-    up = ps.Assignment(src[0, 0], ((dy ** 2 * (src[1, 0] + src[-1, 0]))
+    up = ps.Assignment(dst[0, 0], ((dy ** 2 * (src[1, 0] + src[-1, 0]))
                                    + (dx ** 2 * (src[0, 1] + src[0, -1]))
                                    - (rhs[0, 0] * dx ** 2 * dy ** 2)) / (2 * (dx ** 2 + dy ** 2)))
...