Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
/*
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
inline void cachelineZero(void * p) {
#ifdef __riscv_zicboz
    __asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
...
@@ -38,6 +38,7 @@ class KernelConstraintsCheck:
     def __init__(self, check_independence_condition=True, check_double_write_condition=True):
         self.scopes = NestedScopes()
+        self.field_reads = defaultdict(set)
         self.field_writes = defaultdict(set)
         self.fields_read = set()
         self.check_independence_condition = check_independence_condition
@@ -111,6 +112,13 @@ class KernelConstraintsCheck:
             if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
                 raise ValueError(
                     f"Field {lhs.field.name} is written at two different locations")
+            if fai in self.field_reads:
+                reads = tuple(self.field_reads[fai])
+                if len(reads) > 1 or lhs.offsets != reads[0]:
+                    if self.check_independence_condition:
+                        raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
+                                         f"This means the resulting kernel would not be thread safe")
         elif isinstance(lhs, sp.Symbol):
             if self.scopes.is_defined_locally(lhs):
                 raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
@@ -120,8 +128,9 @@ class KernelConstraintsCheck:
     def update_accesses_rhs(self, rhs):
         if isinstance(rhs, Field.Access) and self.check_independence_condition:
-            writes = self.field_writes[self.FieldAndIndex(
-                rhs.field, rhs.index)]
+            fai = self.FieldAndIndex(rhs.field, rhs.index)
+            writes = self.field_writes[fai]
+            self.field_reads[fai].add(rhs.offsets)
             for write_offset in writes:
                 assert len(writes) == 1
                 if write_offset != rhs.offsets:
...
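For context, a minimal sketch (not part of this diff) of the situation the new field_reads bookkeeping is meant to catch: a kernel that writes a field at the centre while reading a neighbouring cell of the same field depends on the traversal order. With the default configuration (independence check enabled), create_kernel is expected to reject such an assignment; the field name and the exact behaviour shown below are illustrative.

import pytest
import pystencils as ps

f = ps.fields("f: double[2D]")
# write at the centre, read the right neighbour of the same field: the result
# depends on the order in which cells are updated, so the constraint check
# should raise ValueError ("... would not be thread safe")
update = ps.Assignment(f[0, 0], f[1, 0] + 1.0)
with pytest.raises(ValueError):
    ps.create_kernel(update)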
@@ -128,6 +128,7 @@ def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelCon
     # --- check constrains
     check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
                                    check_double_write_condition=not config.allow_double_writes)
     check.visit(assignments)
     assignments.bound_fields = check.fields_written
...
@@ -11,6 +11,7 @@ from sympy.core.relational import Relational
 from sympy.functions.elementary.piecewise import ExprCondPair
 from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
 from sympy.functions.elementary.hyperbolic import HyperbolicFunction
+from sympy.functions.elementary.integers import RoundFunction
 from sympy.logic.boolalg import BooleanFunction
 from sympy.logic.boolalg import BooleanAtom
@@ -213,7 +214,7 @@ class TypeAdder:
                 new_args.append(a)
             return expr.func(*new_args) if new_args else expr, collated_type
         elif isinstance(expr, (sp.Pow, sp.exp, InverseTrigonometricFunction, TrigonometricFunction,
-                               HyperbolicFunction, sp.log)):
+                               HyperbolicFunction, sp.log, RoundFunction)):
             args_types = [self.figure_out_type(arg) for arg in expr.args]
             collated_type = collate_types([t for _, t in args_types])
             new_args = [a if t.dtype_eq(collated_type) else CastFunc(a, collated_type) for a, t in args_types]
...
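As an aside, a small sketch (not part of this diff) of what handling RoundFunction in the TypeAdder enables: sp.floor and sp.ceiling are SymPy RoundFunction subclasses, so their arguments should now be collated to the surrounding field type like the other unary math functions. Field names below are illustrative.

import sympy as sp
import pystencils as ps

f, g = ps.fields("f, g: float32[2D]")
# floor/ceiling arguments get the collated float32 type instead of tripping
# up the type adder (exercised more thoroughly by the test changes below)
asm = ps.Assignment(g[0, 0], sp.floor(f[0, 0]) + sp.ceiling(f[0, 0]))
ast = ps.create_kernel(asm)
code = ps.get_code_str(ast)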
@@ -7,7 +7,7 @@ import sympy as sp
 def is_supported_type(dtype: np.dtype):
     scalar = dtype.type
-    c = np.issctype(dtype)
+    c = np.issubdtype(dtype, np.generic)
     subclass = issubclass(scalar, np.floating) or issubclass(scalar, np.integer) or issubclass(scalar, np.bool_)
     additional_checks = dtype.fields is None and dtype.hasobject is False and dtype.subdtype is None
     return c and subclass and additional_checks
...
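For background (not part of the diff): np.issctype is deprecated in recent NumPy releases and removed in NumPy 2.0, and np.issubdtype(dtype, np.generic) serves as the replacement test for plain scalar dtypes here; structured and object dtypes are still filtered out by the additional_checks line. A quick illustrative check:

import numpy as np

# plain scalar dtypes are subtypes of np.generic ...
assert np.issubdtype(np.dtype('float64'), np.generic)
assert np.issubdtype(np.dtype('int32'), np.generic)
# ... while structured dtypes are caught by the fields/hasobject/subdtype checks
assert np.dtype([('x', 'f8')]).fields is not None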
@@ -82,8 +82,8 @@ def boolean_array_bounding_box(boolean_array):
     >>> a = np.zeros((4, 4), dtype=bool)
     >>> a[1:-1, 1:-1] = True
-    >>> boolean_array_bounding_box(a)
-    [(1, 3), (1, 3)]
+    >>> boolean_array_bounding_box(a) == [(1, 3), (1, 3)]
+    True
     """
     dim = boolean_array.ndim
     shape = boolean_array.shape
...
@@ -3,6 +3,7 @@ import sympy as sp
 import pytest
 import pystencils as ps
+from pystencils.alignedarray import aligned_zeros
 from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.enums import Target
@@ -15,7 +16,7 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_any(instruction_set, dtype):
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         width = 4  # we don't know the actual value
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -34,7 +35,7 @@ def test_vec_any(instruction_set, dtype):
                                 cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         # we only know that the first value has changed
         np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0)
     else:
@@ -44,7 +45,7 @@ def test_vec_any(instruction_set, dtype):
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_all(instruction_set, dtype):
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         width = 1000  # we don't know the actual value, need something guaranteed larger than vector
     else:
         width = get_vector_instruction_set(dtype, instruction_set)['width']
@@ -59,7 +60,7 @@ def test_vec_all(instruction_set, dtype):
                                 cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
-    if instruction_set in ['sve', 'sme', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
         # we only know that some values in the middle have been replaced
         assert np.all(data_arr[3:9, :2] <= 1.0)
         assert np.any(data_arr[3:9, 2:] == 2.0)
@@ -94,16 +95,60 @@ def test_boolean_before_loop():
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
-def test_vec_maskstore(instruction_set, dtype):
-    data_arr = np.zeros((16, 16), dtype=dtype)
+@pytest.mark.parametrize('nontemporal', [False, True])
+@pytest.mark.parametrize('aligned', [False, True])
+def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned):
+    data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype)
     data_arr[3:-3, 3:-3] = 1.0
     data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
     c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
     assignmets = NodeCollection(c)
-    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, default_number_float=dtype)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal,
+                                                       'assume_aligned': aligned},
+                                   default_number_float=dtype)
     ast = ps.create_kernel(assignmets, config=config)
+    if 'maskStore' in ast.instruction_set:
+        instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else (
+            'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore')
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_vec_maskscatter(instruction_set, dtype, nontemporal):
+    data_arr = np.zeros((16, 16), dtype=dtype)
+    data_arr[3:-3, 3:-3] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]")
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+    assignmets = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal},
+                                   default_number_float=dtype)
+    if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \
+            and not instruction_set.startswith('sve'):
+        with pytest.warns(UserWarning) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert 'Could not vectorize loop' in warn[0].message.args[0]
+    else:
+        with pytest.warns(None) as warn:
+            ast = ps.create_kernel(assignmets, config=config)
+            assert len(warn) == 0
+        instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
     print(ps.get_code_str(ast))
     kernel = ast.compile()
     kernel(data=data_arr)
...
 import pytest
 import numpy as np
-import cupy as cp
 import sympy as sp
+import math
 from scipy.ndimage import convolve
-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
+from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target, get_code_str
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice

 try:
-    import cupy
-    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+    import cupy as cp
+    device_numbers = range(cp.cuda.runtime.getDeviceCount())
 except ImportError:
     device_numbers = []
+    cp = None


 def test_averaging_kernel():
+    pytest.importorskip('cupy')
     size = (40, 55)
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
@@ -44,6 +46,7 @@ def test_averaging_kernel():
 def test_variable_sized_fields():
+    pytest.importorskip('cupy')
     src_field = Field.create_generic('src', spatial_dimensions=2)
     dst_field = Field.create_generic('dst', spatial_dimensions=2)
@@ -71,6 +74,7 @@ def test_variable_sized_fields():
 def test_multiple_index_dimensions():
+    pytest.importorskip('cupy')
     """Sums along the last axis of a numpy array"""
     src_size = (7, 6, 4)
     dst_size = src_size[:2]
@@ -103,6 +107,7 @@ def test_multiple_index_dimensions():
 def test_ghost_layer():
+    pytest.importorskip('cupy')
     size = (6, 5)
     src_arr = np.ones(size)
     dst_arr = np.zeros_like(src_arr)
@@ -127,6 +132,7 @@ def test_ghost_layer():
 def test_setting_value():
+    pytest.importorskip('cupy')
     arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
     arr_gpu = cp.asarray(arr_cpu)
@@ -143,6 +149,7 @@ def test_setting_value():
 def test_periodicity():
+    pytest.importorskip('cupy')
     from pystencils.gpu.periodicity import get_periodic_boundary_functor as periodic_gpu
     from pystencils.slicing import get_periodic_boundary_functor as periodic_cpu
@@ -163,6 +170,7 @@ def test_periodicity():
 @pytest.mark.parametrize("device_number", device_numbers)
 def test_block_indexing(device_number):
+    pytest.importorskip('cupy')
     f = fields("f: [3D]")
     s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
     bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
@@ -195,6 +203,7 @@ def test_block_indexing(device_number):
 @pytest.mark.parametrize('layout', ("C", "F"))
 @pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
+    pytest.importorskip('cupy')
     n_elements = np.prod(shape)
     arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout)
@@ -210,3 +219,39 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     kernel(f=arr_gpu, value=np.float64(42.0))
     np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)
+
+
+@pytest.mark.parametrize('start', (1, 5))
+@pytest.mark.parametrize('end', (-1, -2, -3, -4))
+@pytest.mark.parametrize('step', (1, 2, 3, 4))
+@pytest.mark.parametrize('shape', ([55, 60], [77, 101, 80], [44, 64, 66]))
+def test_guards_with_iteration_slices(start, end, step, shape):
+    iter_slice = tuple([slice(start, end, step)] * len(shape))
+    kernel_config_gpu = CreateKernelConfig(target=Target.GPU, iteration_slice=iter_slice)
+    field_1 = fields(f"f(1) : double{list(shape)}")
+    assignment = Assignment(field_1.center, 1)
+
+    ast = create_kernel(assignment, config=kernel_config_gpu)
+    code_str = get_code_str(ast)
+
+    test_strings = list()
+    iteration_ranges = list()
+    for i, s in enumerate(iter_slice):
+        e = ((shape[i] + end) - s.start) / s.step
+        e = math.ceil(e) + s.start
+        test_strings.append(f"{s.start} < {e}")
+
+        a = s.start
+        counter = 0
+        while a < e:
+            a += 1
+            counter += 1
+        iteration_ranges.append(counter)
+
+    # check if the expected if statement is in the GPU code
+    for s in test_strings:
+        assert s in code_str
+
+    # check if these bounds lead to same lengths as the range function would produce
+    for i in range(len(iter_slice)):
+        assert iteration_ranges[i] == len(range(iter_slice[i].start, shape[i] + end, iter_slice[i].step))
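A worked instance of the guard arithmetic in the test above (illustrative only): for start=1, end=-1, step=4 and a shape entry of 55, the bound is e = ceil((55 - 1 - 1) / 4) + 1 = 15, so the expected guard string is "1 < 15" and the counting loop runs 14 times, matching len(range(1, 54, 4)).

import math

# same computation as in the test, for one concrete parameter combination
start, end, step, extent = 1, -1, 4, 55
e = math.ceil(((extent + end) - start) / step) + start
assert e == 15
assert len(range(start, extent + end, step)) == 14 == e - start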
@@ -39,7 +39,7 @@ def test_two_arguments(dtype, func, target):
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
-@pytest.mark.parametrize('func', [sp.sin, sp.cos, sp.sinh, sp.cosh, sp.atan])
+@pytest.mark.parametrize('func', [sp.sin, sp.cos, sp.sinh, sp.cosh, sp.atan, sp.floor, sp.ceiling])
 @pytest.mark.parametrize('target', [ps.Target.CPU, ps.Target.GPU])
 def test_single_arguments(dtype, func, target):
     if target == ps.Target.GPU:
@@ -58,7 +58,8 @@ def test_single_arguments(dtype, func, target):
     ast = ps.create_kernel(up, config=config)
     code = ps.get_code_str(ast)
     if dtype == 'float32':
-        assert func.__name__.lower() in code
+        func_name = func.__name__.lower() if func is not sp.ceiling else "ceil"
+        assert func_name in code
     kernel = ast.compile()
     dh.all_to_gpu()
...
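A note on the ceil special case above (explanatory aside, not part of the change): SymPy names the function ceiling, while the generated C code calls the standard-library ceil/ceilf, so searching the code string for the lower-cased SymPy name would fail for this one function; mapping it to "ceil" keeps the substring check meaningful.

import sympy as sp

# only ceiling differs between the SymPy name and the C math function name
assert sp.ceiling.__name__.lower() == 'ceiling'   # but the generated C code uses ceil/ceilf
assert sp.floor.__name__.lower() == 'floor'       # matches the C name directly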
@@ -32,7 +32,7 @@ if get_compiler_config()['os'] == 'windows':
 def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
     if target == Target.GPU:
         pytest.importorskip('cupy')
-    if instruction_sets and {'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
+    if instruction_sets and {'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
         pytest.xfail('AES not yet implemented for this architecture')
     if rng == 'aesni' and len(keys) == 2:
         keys *= 2
...
@@ -143,10 +143,10 @@ def test_aligned_and_nt_stores(openmp, instruction_set=instruction_set):
     # Without the base pointer spec, the inner store is not aligned
     config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
     ast = ps.create_kernel(update_rule, config=config)
-    if instruction_set in ['sse'] or instruction_set.startswith('avx'):
+    if instruction_set in ['sse'] or instruction_set.startswith('avx') or instruction_set.startswith('sve'):
         assert 'stream' in ast.instruction_set
         assert 'streamFence' in ast.instruction_set
-    if instruction_set in ['neon', 'sme', 'vsx', 'rvv'] or instruction_set.startswith('sve'):
+    if instruction_set in ['neon', 'vsx', 'rvv']:
         assert 'cachelineZero' in ast.instruction_set
     if instruction_set in ['vsx']:
         assert 'storeAAndFlushCacheline' in ast.instruction_set
@@ -331,7 +331,7 @@ def test_logical_operators(instruction_set=instruction_set):
 def test_hardware_query():
-    assert {'sse', 'neon', 'sve', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)
+    assert {'sse', 'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(supported_instruction_sets)

 def test_vectorised_pow(instruction_set=instruction_set):
...
@@ -60,22 +60,27 @@ def test_vectorized_abs(instruction_set, dtype):
 @pytest.mark.parametrize('dtype', ('float32', 'float64'))
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
-def test_strided(instruction_set, dtype):
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_strided(instruction_set, dtype, nontemporal):
     f, g = ps.fields(f"f, g : {dtype}[2D]")
     update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
+    config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                                      'nontemporal': nontemporal},
+                                                  default_number_float=dtype)
     if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \
             and instruction_set not in ['avx512', 'avx512vl', 'rvv'] and not instruction_set.startswith('sve'):
         with pytest.warns(UserWarning) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
-                                                          default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert 'Could not vectorize loop' in warn[0].message.args[0]
     else:
         with pytest.warns(None) as warn:
-            config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set},
-                                                          default_number_float=dtype)
             ast = ps.create_kernel(update_rule, config=config)
             assert len(warn) == 0
+        instruction = 'streamS' if nontemporal and 'streamS' in ast.instruction_set else 'storeS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+        instruction = 'cachelineZero'
+        if instruction in ast.instruction_set:
+            assert ast.instruction_set[instruction] not in ps.get_code_str(ast)

     # ps.show_code(ast)
     func = ast.compile()
@@ -226,7 +231,7 @@ def test_issue62(dtype, instruction_set, padding):
     dy = sp.Symbol("dy")
     src, dst, rhs = ps.fields(f"src, src_tmp, rhs: {dtype}[2D]", layout='fzyx')
-    up = ps.Assignment(src[0, 0], ((dy ** 2 * (src[1, 0] + src[-1, 0]))
+    up = ps.Assignment(dst[0, 0], ((dy ** 2 * (src[1, 0] + src[-1, 0]))
                                    + (dx ** 2 * (src[0, 1] + src[0, -1]))
                                    - (rhs[0, 0] * dx ** 2 * dy ** 2)) / (2 * (dx ** 2 + dy ** 2)))
...