Compare revisions

Alexander Reinauer · Frederik Hennig · Frederik Hennig · Michael Kuron · Markus Holzer · Frederik Hennig
--- a/.gitattributes
+++ b/.gitattributes
-pystencils/_version.py export-subst
+src/pystencils/_version.py export-subst
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
 stages:
  - pretest
  - test
+  - nightly
+  - docs
  - deploy


+# --------------------------  Templates ------------------------------------------------------------------------------------
+
+# Base configuration for jobs meant to run on every commit
+.every-commit:
+  rules:
+    - if: $CI_PIPELINE_SOURCE != "schedule"
+
+# Configuration for jobs meant to run on each commit to pycodegen/pystencils/master
+.every-commit-master:
+  rules:
+    - if: '$CI_PIPELINE_SOURCE != "schedule" && $CI_PROJECT_PATH == "pycodegen/pystencils" && $CI_COMMIT_BRANCH == "master"'
+
+# Base configuration for jobs meant to run on a schedule
+.scheduled:
+  rules:
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+
 # --------------------------  Tests ------------------------------------------------------------------------------------

 # Normal test - runs on every commit all but "long run" tests
 tests-and-coverage:
  stage: pretest
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
+  extends: .every-commit
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
  before_script:
    - pip install -e .
  script:
@@ -45,7 +62,7 @@ tests-and-coverage-with-longrun:
  stage: test
  when: manual
  allow_failure: true
-  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
  before_script:
    - pip install sympy --upgrade
    - pip install -e .
@@ -65,9 +82,7 @@ tests-and-coverage-with-longrun:
 # pipeline with latest python version
 latest-python:
  stage: test
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
+  extends: .every-commit
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/latest_python
  before_script:
    - pip install -e .
@@ -92,9 +107,6 @@ latest-python:
 # Minimal tests in windows environment
 #minimal-windows:
 #  stage: test
-#  except:
-#    variables:
-#      - $ENABLE_NIGHTLY_BUILDS
 #  tags:
 #    - win
 #  script:
@@ -108,9 +120,7 @@ latest-python:

 ubuntu:
  stage: test
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
+  extends: .every-commit
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ubuntu
  before_script:
    - ln -s /usr/include/locale.h /usr/include/xlocale.h
@@ -134,9 +144,8 @@ ubuntu:

 .multiarch_template:
  stage: test
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
+  extends: .every-commit
+  allow_failure: true
  before_script: &multiarch_before_script
    # - pip3 install -v .
    - export PYTHONPATH=src
@@ -149,13 +158,19 @@ ubuntu:
    - sed -i 's/--doctest-modules //g' pytest.ini
    - env
    - pip3 list
-    - python3 -m pytest -v -n $NUM_CORES --junitxml=report.xml tests/test_*vec*.py tests/test_random.py tests/test_half_precision.py
+    - python3 -m pytest -v -n $NUM_CORES --reruns 2 --cov-report html --cov-report xml --cov=. --junitxml=report.xml tests/test_*vec*.py tests/test_random.py tests/test_half_precision.py
+    - python3 -m coverage xml
  tags:
    - docker
-    - AVX
+    - multiarch
  artifacts:
    when: always
+    paths:
+      - coverage_report
    reports:
+      coverage_report:
+        coverage_format: cobertura
+        path: coverage.xml
      junit: report.xml

 arm64v8:
@@ -165,45 +180,41 @@ arm64v8:
    QEMU_CPU: "cortex-a76"
  before_script:
    - *multiarch_before_script
-    - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json

 ppc64le:
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
  before_script:
    - *multiarch_before_script
-    - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json

 arm64v9:
-  # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
+  # SVE support is still unreliable in GCC 13 (incorrect code for fixed-width vectors, internal compiler errors).
  # For half precision Clang is necessary
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
+  variables:
+    # use QEMU's fast implementation-defined pointer authentication algorithm to speed up emulation by 3x
+    QEMU_CPU: "max,pauth-impdef=on"
  before_script:
    - *multiarch_before_script
-    - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
    - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json

 riscv64:
-  # RISC-V vector extension are currently not supported by GCC.
-  # Also, the image is built without the libomp package which is not yet available on Ubuntu.
+  # RISC-V vector extension support is incomplete in GCC 13.
  extends: .multiarch_template
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
  variables:
-    # explicitly set SIMD as detection does not appear to work on QEMU
+    # explicitly set SIMD as detection requires QEMU >= 8.1
    PYSTENCILS_SIMD: "rvv"
-    QEMU_CPU: "rv64,v=true"
+    QEMU_CPU: "rv64,v=true,zicboz=true"
  before_script:
    - *multiarch_before_script
-    - sed -i 's/march=native/march=rv64imfdv/g' ~/.config/pystencils/config.json
+    - sed -i 's/march=native/march=rv64imfdvzicboz/g' ~/.config/pystencils/config.json
    - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json
-    - sed -i 's/fopenmp/fopenmp=libgomp -I\/usr\/include\/riscv64-linux-gnu/g' ~/.config/pystencils/config.json

 minimal-conda:
  stage: pretest
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
+  extends: .every-commit
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/minimal_conda
  before_script:
    - pip install -e .
@@ -216,9 +227,7 @@ minimal-conda:

 minimal-sympy-master:
  stage: test
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
+  extends: .every-commit
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/minimal_conda
  before_script:
    - pip install -e .
@@ -275,14 +284,42 @@ pycodegen-integration:
    reports:
      junit: pycodegen/*/report.xml

+
+# -------------------- Scheduled Tasks --------------------------------------------------------------------------
+
+
+# Nightly test against the latest (pre-release) version of SymPy published on PyPI
+nightly-sympy:
+  stage: nightly
+  needs: []
+  extends: .scheduled
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/latest_python
+  before_script:
+    - pip install -e .
+    - pip install --upgrade --pre sympy
+  script:
+    - env
+    - pip list
+    - export NUM_CORES=$(nproc --all)
+    - mkdir -p ~/.config/matplotlib
+    - echo "backend:template" > ~/.config/matplotlib/matplotlibrc
+    - mkdir public
+    - pytest -v -n $NUM_CORES -m "not longrun" --junitxml=report.xml
+  tags:
+    - docker
+    - AVX
+    - cuda
+  artifacts:
+    when: always
+    reports:
+      junit: report.xml
+
 # -------------------- Linter & Documentation --------------------------------------------------------------------------


 flake8-lint:
  stage: pretest
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
+  extends: .every-commit
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
  script:
    - flake8 src/pystencils
@@ -291,8 +328,10 @@ flake8-lint:


 build-documentation:
-  stage: test
+  stage: docs
+  extends: .every-commit
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/documentation
+  needs: []
  before_script:
    - pip install -e .
  script:
@@ -308,7 +347,9 @@ build-documentation:

 pages:
  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
+  extends: .every-commit-master
  stage: deploy
+  needs: ["tests-and-coverage", "build-documentation"]
  script:
    - ls -l
    - mv coverage_report html_doc
@@ -318,5 +359,3 @@ pages:
      - public
  tags:
    - docker
-  only:
-    - master@pycodegen/pystencils
--- a/doc/notebooks/02_tutorial_basic_kernels.ipynb
+++ b/doc/notebooks/02_tutorial_basic_kernels.ipynb
--- a/doc/notebooks/06_tutorial_phasefield_dentritic_growth.ipynb
+++ b/doc/notebooks/06_tutorial_phasefield_dentritic_growth.ipynb
--- a/doc/notebooks/demo_assignment_collection.ipynb
+++ b/doc/notebooks/demo_assignment_collection.ipynb
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ authors = [
 ]
 license = { file = "COPYING.txt" }
 requires-python = ">=3.10"
-dependencies = ["sympy>=1.6,<=1.11.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml"]
+dependencies = ["sympy>=1.9,<=1.12.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml", "fasteners"]
 classifiers = [
    "Development Status :: 4 - Beta",
    "Framework :: Jupyter",
@@ -70,8 +70,7 @@ tests = [
 [build-system]
 requires = [
    "setuptools>=61",
-    "versioneer>=0.29",
-    "tomli; python_version < '3.11'",
+    "versioneer[toml]>=0.29",
    # 'Cython'
 ]
 build-backend = "setuptools.build_meta"

--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -36,7 +36,5 @@ __all__ = ['Field', 'FieldType', 'fields',
           'fd',
           'stencil']

-from ._version import get_versions
-
-__version__ = get_versions()['version']
-del get_versions
+from . import _version
+__version__ = _version.get_versions()['version']
--- a/src/pystencils/_version.py
+++ b/src/pystencils/_version.py
--- a/src/pystencils/astnodes.py
+++ b/src/pystencils/astnodes.py
@@ -5,12 +5,12 @@ from typing import Any, List, Optional, Sequence, Set, Union

 import sympy as sp

-import pystencils
-from pystencils.typing.utilities import create_type, get_next_parent_of_type
+from pystencils.assignment import Assignment
 from pystencils.enums import Target, Backend
 from pystencils.field import Field
-from pystencils.typing.typed_sympy import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol
 from pystencils.sympyextensions import fast_subs
+from pystencils.typing import (create_type, get_next_parent_of_type,
+                               FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol, CFunction)

 NodeOrExpr = Union['Node', sp.Expr]

@@ -270,6 +270,9 @@ class KernelFunction(Node):
        parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
        if hasattr(self, 'indexing'):
            parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
+        # Exclude parameters of type CFunction. These parameters will result in a C function call that will be handled
+        # by including a respective header file in the compute kernel. Hence, they are not free parameters.
+        parameters = [p for p in parameters if not isinstance(p.symbol, CFunction)]
        parameters.sort(key=lambda p: p.symbol.name)
        return parameters

@@ -387,7 +390,7 @@ class Block(Node):
    def symbols_defined(self):
        result = set()
        for a in self.args:
-            if isinstance(a, pystencils.Assignment):
+            if isinstance(a, Assignment):
                result.update(a.free_symbols)
            else:
                result.update(a.symbols_defined)
@@ -398,7 +401,7 @@ class Block(Node):
        result = set()
        defined_symbols = set()
        for a in self.args:
-            if isinstance(a, pystencils.Assignment):
+            if isinstance(a, Assignment):
                result.update(a.free_symbols)
                defined_symbols.update({a.lhs})
            else:

--- a/src/pystencils/backends/arm_instruction_sets.py
+++ b/src/pystencils/backends/arm_instruction_sets.py
+from pystencils.typing import CFunction
+
+
 def get_argument_string(function_shortcut, first=''):
    args = function_shortcut[function_shortcut.index('[') + 1: -1]
    arg_string = "("
@@ -16,10 +19,13 @@ def get_argument_string(function_shortcut, first=''):


 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
-    if instruction_set != 'neon' and not instruction_set.startswith('sve'):
+    if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
        raise NotImplementedError(instruction_set)
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
+        cmp = 'cmp'
+    elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
        cmp = 'cmp'
+        bitwidth = int(instruction_set[4:])
    elif instruction_set.startswith('sve'):
        cmp = 'cmp'
        bitwidth = int(instruction_set[3:])
@@ -52,7 +58,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):

    result = dict()

-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
        width = 'svcntd()' if data_type == 'double' else 'svcntw()'
        intwidth = 'svcntw()'
        result['bytes'] = 'svcntb()'
@@ -60,14 +66,15 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        width = bitwidth // bits[data_type]
        intwidth = bitwidth // bits['int']
        result['bytes'] = bitwidth // 8
-    if instruction_set.startswith('sve'):
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
+        base_names['stream'] = 'stnt1[0, 1]'
        prefix = 'sv'
-        suffix = f'_f{bits[data_type]}' 
+        suffix = f'_f{bits[data_type]}'
    elif instruction_set == 'neon':
        prefix = 'v'
-        suffix = f'q_f{bits[data_type]}' 
+        suffix = f'q_f{bits[data_type]}'

-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
        predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
        int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
    else:
@@ -86,33 +93,36 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):

        result[intrinsic_id] = prefix + name + suffix + undef + arg_string

-    if instruction_set == 'sve':
-        from pystencils.backends.cbackend import CFunction
+    if instruction_set in ['sve', 'sve2', 'sme']:
        result['width'] = CFunction(width, "int")
        result['intwidth'] = CFunction(intwidth, "int")
    else:
        result['width'] = width
        result['intwidth'] = intwidth

-    if instruction_set.startswith('sve'):
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
        result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
        result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
        result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'

-        vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
-        result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
-                           vindex.format("{2}") + ', {1})'
-        result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
-                          vindex.format("{1}") + ')'
+        if instruction_set != 'sme':
+            vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
+            result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                               vindex.format("{2}") + ', {1})'
+            result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                              vindex.format("{1}") + ')'
+        if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+            result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                                vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'

        result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"

-        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set != "sve" else ""}t'
-        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set != "sve" else ""}t'
-        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set != "sve" else ""}t'
-        result['bool'] = f'svbool_{"s" if instruction_set != "sve" else ""}t'
+        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'

-        result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"']
+        result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']

        result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
        result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
@@ -121,9 +131,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'

        result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
-        result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+        result['maskStream'] = result['stream'].replace(predicate, '{2}')
+        if instruction_set != 'sme':
+            result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+            if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+                result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')

-        if instruction_set != 'sve':
+        result['streamFence'] = '__dmb(15)'
+
+        if instruction_set == 'sme':
+            result['function_prefix'] = '__arm_locally_streaming'
+        elif instruction_set not in ['sve', 'sve2', 'sme']:
            result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
    else:
        result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
@@ -148,7 +166,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
        result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'

+        # SVE has real nontemporal stores, so we only need to zero cachelines on Neon
+        result['cachelineZero'] = 'cachelineZero((void*) {0})'
+
    result['cachelineSize'] = 'cachelineSize()'
-    result['cachelineZero'] = 'cachelineZero((void*) {0})'

    return result
--- a/src/pystencils/backends/cbackend.py
+++ b/src/pystencils/backends/cbackend.py
@@ -6,7 +6,6 @@ from typing import Set
 import numpy as np
 import sympy as sp
 from sympy.core import S
-from sympy.core.cache import cacheit
 from sympy.logic.boolalg import BooleanFalse, BooleanTrue
 from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
 from sympy.functions.elementary.hyperbolic import HyperbolicFunction
@@ -15,7 +14,7 @@ from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
 from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
 from pystencils.typing import (
    PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
-    ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol)
+    ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol, CFunction)
 from pystencils.enums import Backend
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
 from pystencils.functions import DivFunc, AddressOf
@@ -166,23 +165,6 @@ class PrintNode(CustomCodeNode):
        self.headers.append("<iostream>")


-class CFunction(TypedSymbol):
-    def __new__(cls, function, dtype):
-        return CFunction.__xnew_cached_(cls, function, dtype)
-
-    def __new_stage2__(cls, function, dtype):
-        return super(CFunction, cls).__xnew__(cls, function, dtype)
-
-    __xnew__ = staticmethod(__new_stage2__)
-    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
-
-    def __getnewargs__(self):
-        return self.name, self.dtype
-
-    def __getnewargs_ex__(self):
-        return (self.name, self.dtype), {}
-
-
 # ------------------------------------------- Printer ------------------------------------------------------------------


@@ -280,14 +262,25 @@ class CBackend:
            if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
                arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
                instr = 'storeU'
-                if aligned:
+                if nontemporal and 'storeA' not in self._vector_instruction_set and \
+                        'stream' in self._vector_instruction_set:
+                    instr = 'stream'
+                elif aligned:
                    instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                if mask != True:  # NOQA
-                    instr = 'maskStoreA' if aligned else 'maskStoreU'
+                    instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
+                            'maskStoreA' if aligned else 'maskStoreU'
                    if instr not in self._vector_instruction_set:
-                        self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format(
+                        if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
+                            store, load = 'stream', 'loadA'
+                        elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
+                            store, load = 'storeA', 'loadA'
+                        else:
+                            store, load = 'storeU', 'loadU'
+                        load = load if load in self._vector_instruction_set else 'loadU'
+                        self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
                            '{0}', self._vector_instruction_set['blendv'].format(
-                                self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs),
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
                                '{1}', '{2}', **self._kwargs), **self._kwargs)
                    printed_mask = self.sympy_printer.doprint(mask)
                    if data_type.base_type.c_name == 'double':
@@ -312,12 +305,14 @@ class CBackend:
                ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])

                if stride != 1:
-                    instr = 'maskStoreS' if mask != True else 'storeS'  # NOQA
+                    instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
+                             'maskStoreS') if mask != True else \
+                            ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS')  # NOQA
                    return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                      stride, printed_mask, **self._kwargs) + ';'

                pre_code = ''
-                if nontemporal and 'cachelineZero' in self._vector_instruction_set:
+                if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True:  # NOQA
                    first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
                    offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
                                      * node.lhs.args[0].field.spatial_strides[i] for i in
@@ -337,15 +332,22 @@ class CBackend:
                    code2 = self._vector_instruction_set['flushCacheline'].format(
                        ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
                    code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
-                elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
+                elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
                    lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
                    rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
                    tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
                    code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
                        + self.sympy_printer.doprint(rhs) + ';'
                    code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
-                    code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask,
-                                                                                           **self._kwargs) + ';'
+                    maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
+                    instr2 = maskStore if mask != True else store  # NOQA
+                    if instr2 not in self._vector_instruction_set:
+                        self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
+                            '{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
+                                '{1}', '{2}', **self._kwargs),
+                            **self._kwargs)
+                    code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
                    code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
                return pre_code + code
            else:
@@ -614,7 +616,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
            return None

    def _print_Abs(self, expr):
-        if 'abs' in self.instruction_set and isinstance(expr.args[0], VectorMemoryAccess):
+        if isinstance(get_type_of_expression(expr), (VectorType, VectorMemoryAccess)):
            return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs)
        return super()._print_Abs(expr)


--- a/src/pystencils/backends/riscv_instruction_sets.py
+++ b/src/pystencils/backends/riscv_instruction_sets.py
+from pystencils.typing import CFunction
+
+
 def get_argument_string(function_shortcut, last=''):
    args = function_shortcut[function_shortcut.index('[') + 1: -1]
    arg_string = "("
@@ -34,7 +37,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
        'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
        'loadS': f'lse{bits[data_type]}_v[0, 1]',
        'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
-        'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]',
+        'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',

        'abs': 'fabs_v[0]',
        '==': 'mfeq_vv[0, 1]',
@@ -46,17 +49,17 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
        '&': 'mand_mm[0, 1]',
        '|': 'mor_mm[0, 1]',

-        'blendv': 'merge_vvm[2, 0, 1]',
+        'blendv': 'merge_vvm[0, 1, 2]',
        'any': 'cpop_m[0]',
        'all': 'cpop_m[0]',
    }

    result = dict()

-    width = f'vsetvlmax_e{bits[data_type]}m1()'
-    intwidth = 'vsetvlmax_e{bits["int"]}m1()'
-    result['bytes'] = 'vsetvlmax_e8m1()'
-    prefix = 'v'
+    prefix = '__riscv_v'
+    width = f'{prefix}setvlmax_e{bits[data_type]}m1()'
+    intwidth = f'{prefix}setvlmax_e{bits["int"]}m1()'
+    result['bytes'] = f'{prefix}setvlmax_e8m1()'
    suffix = f'_f{bits[data_type]}m1'

    vl = '{loop_stop} - {loop_counter}'
@@ -78,29 +81,31 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):

        result[intrinsic_id] = prefix + name + suffix2 + arg_string

-    from pystencils.backends.cbackend import CFunction
    result['width'] = CFunction(width, "int")
    result['intwidth'] = CFunction(intwidth, "int")

-    result['makeVecConst'] = f'vfmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
-    result['makeVecConstInt'] = f'vmv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
-    result['makeVecIndex'] = f'vmacc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
-                             f'vid_v_i{bits["int"]}m1({int_vl}), {int_vl})'
+    result['makeVecConst'] = f'{prefix}fmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
+    result['makeVecConstInt'] = f'{prefix}mv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
+    result['makeVecIndex'] = f'{prefix}macc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
+                             f'{prefix}id_v_i{bits["int"]}m1({int_vl}), {int_vl})'

    result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
    result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
-    result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}')
+    result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')

-    result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
+    result['+int'] = f"{prefix}add_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"

    result['float'] = f'vfloat{bits["float"]}m1_t'
    result['double'] = f'vfloat{bits["double"]}m1_t'
    result['int'] = f'vint{bits["int"]}m1_t'
    result['bool'] = f'vbool{bits[data_type]}_t'

-    result['headers'] = ['<riscv_vector.h>']
+    result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']

    result['any'] += ' > 0x0'
-    result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
+    result['all'] += f' == {prefix}setvl_e{bits[data_type]}m1({vl})'
+
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'

    return result
--- a/src/pystencils/backends/simd_instruction_sets.py
+++ b/src/pystencils/backends/simd_instruction_sets.py
 import os
 import platform
-from ctypes import CDLL
+from ctypes import CDLL, c_int, c_size_t, sizeof, byref
 from warnings import warn

 import numpy as np
@@ -22,7 +22,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):

    type_name = numpy_name_to_c(np.dtype(data_type).name)

-    if instruction_set in ['neon'] or instruction_set.startswith('sve'):
+    if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
        return get_vector_instruction_set_arm(type_name, instruction_set)
    elif instruction_set in ['vsx']:
        return get_vector_instruction_set_ppc(type_name, instruction_set)
@@ -38,21 +38,35 @@ def get_supported_instruction_sets():
    if 'PYSTENCILS_SIMD' in os.environ:
        return os.environ['PYSTENCILS_SIMD'].split(',')
    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
-        return ['neon']
+        result = ['neon']
+        libc = CDLL('/usr/lib/libc.dylib')
+        value = c_int(0)
+        size = c_size_t(sizeof(value))
+        status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
+        if status == 0 and value.value == 1:
+            result.insert(0, "sme")
+        return result
    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
        return ['neon']
    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
        result = ['neon']  # Neon is mandatory on 64-bit ARM
        libc = CDLL('libc.so.6')
        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
        if hwcap & (1 << 22):  # HWCAP_SVE
+            if hwcap2 & (1 << 1):  # HWCAP2_SVE2
+                name = 'sve2'
+            else:
+                name = 'sve'
            length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
            if length < 0:
                raise OSError("SVE length query failed")
            while length >= 128:
-                result.append(f"sve{length}")
+                result.append(f"{name}{length}")
                length //= 2
-            result.append("sve")
+            result.append(name)
+        if hwcap2 & (1 << 23):  # HWCAP2_SME
+            result.insert(0, "sme")  # prepend to list so it is not automatically chosen as best instruction set
        return result
    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
        libc = CDLL('libc.so.6')

--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -35,11 +35,11 @@ class FlagInterface:
        >>> dh = create_data_handling((4, 5))
        >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
        >>> assert dh.has_data('flag_field')
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        2
-        >>> fi.reserve_flag(4)
+        >>> int(fi.reserve_flag(4))
        4
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        8
    """

@@ -450,5 +450,6 @@ def create_boundary_kernel(field, index_field, stencil, boundary_functor, target
    dir_symbol = TypedSymbol("dir", np.int32)
    elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
    elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
-    config = CreateKernelConfig(index_fields=[index_field], target=target, **kernel_creation_args)
+    config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
+                                **kernel_creation_args)
    return create_kernel(elements, config=config)
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -135,8 +135,9 @@ class CreateKernelConfig:
    """
    skip_independence_check: bool = False
    """
-    Don't check that loop iterations are independent. This is needed e.g. for
-    periodicity kernel, that access the field outside the iteration bounds. Use with care!
+    By default the assignment list is checked for read/write independence. This means fields are only written at
+    locations where they are read. Doing so guarantees thread safety. In some cases, e.g. for
+    periodicity kernels, this cannot be assured and thus the check needs to be deactivated. Use with care!
    """

    class DataTypeFactory:

--- a/src/pystencils/cpu/cpujit.py
+++ b/src/pystencils/cpu/cpujit.py
@@ -45,6 +45,7 @@ Then 'cl.exe' is used to compile.
 """
 from appdirs import user_cache_dir, user_config_dir
 from collections import OrderedDict
+from ctypes import CDLL
 import hashlib
 import importlib.util
 import json
@@ -57,12 +58,14 @@ import tempfile
 import textwrap
 import time
 import warnings
+import pathlib

 import numpy as np

 from pystencils import FieldType
 from pystencils.astnodes import LoopOverCoordinate
-from pystencils.backends.cbackend import generate_c, get_headers, CFunction
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
 from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
@@ -122,15 +125,15 @@ def get_configuration_file_path():

    # 1) Read path from environment variable if found
    if 'PYSTENCILS_CONFIG' in os.environ:
-        return os.environ['PYSTENCILS_CONFIG'], True
+        return os.environ['PYSTENCILS_CONFIG']
    # 2) Look in current directory for pystencils.json
    elif os.path.exists("pystencils.json"):
-        return "pystencils.json", True
+        return "pystencils.json"
    # 3) Try ~/.pystencils.json
    elif os.path.exists(config_path_in_home):
-        return config_path_in_home, True
+        return config_path_in_home
    else:
-        return config_path_in_home, False
+        return config_path_in_home


 def create_folder(path, is_file):
@@ -150,9 +153,36 @@ def read_config():
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
-        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
-                                                                                        '-mcpu=native')
+        if platform.machine().startswith('ppc64'):
+            # -mcpu=native is available, but only works when the compiler recognizes the exact CPU model. This fails in
+            # QEMU or when the compiler is older than the CPU, so we do our own detection here.
+            libc = CDLL('libc.so.6')
+            hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+            if hwcap2 & 0x00040000:  # PPC_FEATURE2_ARCH_3_1
+                flag = '-mcpu=power10'
+            elif hwcap2 & 0x00800000:  # PPC_FEATURE2_ARCH_3_00
+                flag = '-mcpu=power9'
+            elif hwcap2 & 0x80000000:  # PPC_FEATURE2_ARCH_2_07
+                flag = '-mcpu=power8'
+            else:
+                flag = '-mcpu=native'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', flag)
+        elif platform.machine() == 'aarch64':
+            # -mcpu=native is available, but only works when the compiler recognizes the exact CPU model. This fails in
+            # QEMU or when the compiler is older than the CPU, so we do our own detection here.
+            flag = '-march=armv8-a+crypto'
+            if any(i.startswith('sve2') and i not in ('sve256', 'sve2048') for i in get_supported_instruction_sets()):
+                flag += '+sve2'
+            elif any(i.startswith('sve') for i in get_supported_instruction_sets()):
+                flag += '+sve'
+            if 'sme' in get_supported_instruction_sets():
+                flag += '+sme'
+            if '+sve' in flag:
+                libc = CDLL('libc.so.6')
+                hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+                if hwcap2 & (1 << 2):  # HWCAP2_SVEAES
+                    flag += '+sve2-aes'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', flag)
    elif platform.system().lower() == 'windows':
        default_compiler_config = OrderedDict([
            ('os', 'windows'),
@@ -172,7 +202,11 @@ def read_config():
            ('restrict_qualifier', '__restrict__')
        ])
        if platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', '')
+            if 'sme' in get_supported_instruction_sets():
+                flag = '-march=armv8.7-a+sme+aes '
+            else:
+                flag = ''
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
        for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
                       '/opt/homebrew/lib/libomp.dylib']:
            if os.path.exists(libomp):
@@ -190,16 +224,22 @@ def read_config():
    default_config = OrderedDict([('compiler', default_compiler_config),
                                  ('cache', default_cache_config)])

-    config_path, config_exists = get_configuration_file_path()
+    from fasteners import InterProcessLock
+
+    config_path = pathlib.Path(get_configuration_file_path())
+    config_path.parent.mkdir(parents=True, exist_ok=True)
+    
    config = default_config.copy()
-    if config_exists:
-        with open(config_path, 'r') as json_config_file:
-            loaded_config = json.load(json_config_file)
-        config = recursive_dict_update(config, loaded_config)
-    else:
-        create_folder(config_path, True)
-        with open(config_path, 'w') as f:
-            json.dump(config, f, indent=4)
+
+    lockfile = config_path.with_suffix(config_path.suffix + ".lock")
+    with InterProcessLock(lockfile):
+        if config_path.exists():
+            with open(config_path, 'r') as json_config_file:
+                loaded_config = json.load(json_config_file)
+            config = recursive_dict_update(config, loaded_config)
+        else:
+            with open(config_path, 'w') as f:
+                json.dump(config, f, indent=4)

    if config['cache']['object_cache'] is not False:
        config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())
@@ -447,8 +487,6 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
            parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
        elif param.is_field_shape:
            parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
-        elif type(param.symbol) is CFunction:
-            continue
        else:
            extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
            pre_call_code += template_extract_scalar.format(extract_function=extract_function,
@@ -617,7 +655,12 @@ def compile_and_load(ast, custom_backend=None):
    cache_config = get_cache_config()

    compiler_config = get_compiler_config()
-    function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else ''
+    if compiler_config['os'].lower() == 'windows':
+        function_prefix = '__declspec(dllexport)'
+    elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
+        function_prefix = ast.instruction_set['function_prefix']
+    else:
+        function_prefix = ''

    code = ExtensionModuleCode(custom_backend=custom_backend)
    code.add_function(ast, ast.function_name)

--- a/src/pystencils/field.py
+++ b/src/pystencils/field.py
@@ -948,24 +948,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0


 def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
-    if layout_str in ('fzyx', 'zyxf'):
-        assert dim <= 3
-        return tuple(reversed(range(dim)))
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
+    
+    layout_str = layout_str.lower()

-    if layout_str in ('fzyx', 'f', 'reverse_numpy', 'SoA'):
+    if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
+        if dim > 3:
+            raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
+        return tuple(reversed(range(dim)))
+    
+    if layout_str in ('f', 'reverse_numpy'):
        return tuple(reversed(range(dim)))
-    elif layout_str in ('c', 'numpy', 'AoS'):
+    elif layout_str in ('c', 'numpy'):
        return tuple(range(dim))
    raise ValueError("Unknown layout descriptor " + layout_str)


 def layout_string_to_tuple(layout_str, dim):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
+    
    layout_str = layout_str.lower()
    if layout_str == 'fzyx' or layout_str == 'soa':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim)))
    elif layout_str == 'zyxf' or layout_str == 'aos':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim - 1))) + (dim - 1,)
    elif layout_str == 'f' or layout_str == 'reverse_numpy':
        return tuple(reversed(range(dim)))

--- a/src/pystencils/gpu/indexing.py
+++ b/src/pystencils/gpu/indexing.py
@@ -224,6 +224,9 @@ class BlockIndexing(AbstractIndexing):
            assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
            numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
        end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
+        for i, s in enumerate(numeric_iteration_slice):
+            if s.step and s.step != 1:
+                end[i] = div_ceil(s.stop - s.start, s.step) + s.start

        if self._dim < 4:
            conditions = [c < e for c, e in zip(self.coordinates, end)]

--- a/src/pystencils/gpu/kernelcreation.py
+++ b/src/pystencils/gpu/kernelcreation.py
@@ -66,15 +66,18 @@ def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
        iteration_space = normalize_slice(iteration_slice, common_shape)
    else:
        iteration_space = normalize_slice(iteration_slice, common_shape)
-    iteration_space = tuple([s if isinstance(s, slice) else slice(s, s, 1) for s in iteration_space])
+    
+    iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])

    loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]

    if len(indexed_elements) > 0:
        common_indexed_element = get_common_indexed_element(indexed_elements)
+        index = common_indexed_element.indices[0].atoms(TypedSymbol)
+        assert len(index) == 1, "index expressions must only contain one symbol representing the index"
        indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
                                    data_layout=common_field.layout)
-        extended_ctrs = [common_indexed_element.indices[0], *loop_counter_symbols]
+        extended_ctrs = [index.pop(), *loop_counter_symbols]
        loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
    else:
        indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)

--- a/src/pystencils/include/PyStencilsField.h
+++ b/src/pystencils/include/PyStencilsField.h
-#pragma once
-
-extern "C++" {
-#ifdef __CUDA_ARCH__
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  DTYPE_T shape[DIMENSION];
-  DTYPE_T stride[DIMENSION];
-};
-#else
-#include <array>
-
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  std::array<DTYPE_T, DIMENSION> shape;
-  std::array<DTYPE_T, DIMENSION> stride;
-};
-#endif
-}
No results found