Michael Kuron · Michael Kuron · Stephan Seitz · Stephan Seitz · Michael Kuron · Martin Bauer
--- a/conftest.py
+++ b/conftest.py
@@ -3,6 +3,7 @@ import pytest
 import tempfile
 import runpy
 import sys
+import warnings
 # Trigger config file reading / creation once - to avoid race conditions when multiple instances are creating it
 # at the same time
 from pystencils.cpu import cpujit
@@ -128,8 +129,11 @@ class IPyNbFile(pytest.File):
        exporter.exclude_input_prompt = True

        notebook_contents = self.fspath.open()
-        notebook = nbformat.read(notebook_contents, 4)
-        code, _ = exporter.from_notebook_node(notebook)
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "IPython.core.inputsplitter is deprecated")
+            notebook = nbformat.read(notebook_contents, 4)
+            code, _ = exporter.from_notebook_node(notebook)
        yield IPyNbTest(self.name, self, code)

    def teardown(self):

--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -310,6 +310,7 @@ class Block(Node):

    def insert_before(self, new_node, insert_before):
        new_node.parent = self
+        assert self._nodes.count(insert_before) == 1
        idx = self._nodes.index(insert_before)

        # move all assignment (definitions to the top)
@@ -337,6 +338,7 @@ class Block(Node):
        return tmp

    def replace(self, child, replacements):
+        assert self._nodes.count(child) == 1
        idx = self._nodes.index(child)
        del self._nodes[idx]
        if type(replacements) is list:
@@ -516,12 +518,13 @@ class LoopOverCoordinate(Node):


 class SympyAssignment(Node):
-    def __init__(self, lhs_symbol, rhs_expr, is_const=True):
+    def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False):
        super(SympyAssignment, self).__init__(parent=None)
        self._lhs_symbol = lhs_symbol
        self.rhs = sp.sympify(rhs_expr)
        self._is_const = is_const
        self._is_declaration = self.__is_declaration()
+        self.use_auto = use_auto

    def __is_declaration(self):
        if isinstance(self._lhs_symbol, cast_func):

--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -4,16 +4,17 @@ from typing import Set
 import numpy as np
 import sympy as sp
 from sympy.core import S
+from sympy.logic.boolalg import BooleanFalse, BooleanTrue
 from sympy.printing.ccode import C89CodePrinter
+
 from pystencils.astnodes import KernelFunction, Node
 from pystencils.cpu.vectorization import vec_all, vec_any
 from pystencils.data_types import (
-    PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
-    reinterpret_cast_func, vector_memory_access)
+    PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression, reinterpret_cast_func,
+    vector_memory_access)
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
 from pystencils.integer_functions import (
-    bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
-    int_div, int_power_of_2, modulo_ceil)
+    bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor, int_div, int_power_of_2, modulo_ceil)

 try:
    from sympy.printing.ccode import C99CodePrinter as CCodePrinter
@@ -32,9 +33,9 @@ def generate_c(ast_node: Node,
               with_globals=True) -> str:
    """Prints an abstract syntax tree node as C or CUDA code.

-    This function does not need to distinguish between C, C++ or CUDA code, it just prints 'C-like' code as encoded
-    in the abstract syntax tree (AST). The AST is built differently for C or CUDA by calling different create_kernel
-    functions.
+    This function does not need to distinguish for most AST nodes between C, C++ or CUDA code, it just prints 'C-like'
+    code as encoded in the abstract syntax tree (AST). The AST is built differently for C or CUDA by calling different
+    create_kernel functions.

    Args:
        ast_node:
@@ -229,11 +230,15 @@ class CBackend:

    def _print_SympyAssignment(self, node):
        if node.is_declaration:
-            if node.is_const:
-                prefix = 'const '
+            if node.use_auto:
+                data_type = 'auto '
            else:
-                prefix = ''
-            data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "
+                if node.is_const:
+                    prefix = 'const '
+                else:
+                    prefix = ''
+                data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "
+
            return "%s%s = %s;" % (data_type,
                                   self.sympy_printer.doprint(node.lhs),
                                   self.sympy_printer.doprint(node.rhs))
@@ -292,6 +297,10 @@ class CBackend:
        return ""

    def _print_Conditional(self, node):
+        if type(node.condition_expr) is BooleanTrue:
+            return self._print_Block(node.true_block)
+        elif type(node.condition_expr) is BooleanFalse:
+            return self._print_Block(node.false_block)
        cond_type = get_type_of_expression(node.condition_expr)
        if isinstance(cond_type, VectorType):
            raise ValueError("Problem with Conditional inside vectorized loop - use vec_any or vec_all")
@@ -381,7 +390,8 @@ class CustomSympyPrinter(CCodePrinter):
        elif expr.func == int_div:
            return "((%s) / (%s))" % (self._print(expr.args[0]), self._print(expr.args[1]))
        else:
-            return super(CustomSympyPrinter, self)._print_Function(expr)
+            arg_str = ', '.join(self._print(a) for a in expr.args)
+            return f'{expr.name}({arg_str})'

    def _typed_number(self, number, dtype):
        res = self._print(number)

--- a/pystencils/cpu/kernelcreation.py
+++ b/pystencils/cpu/kernelcreation.py
@@ -34,6 +34,7 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
                      transformation :func:`pystencils.transformation.split_inner_loop`
        iteration_slice: if not None, iteration is done only over this slice of the field
        ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
+                      that should be excluded from the iteration.
                     if None, the number of ghost layers is determined automatically and assumed to be equal for a
                     all dimensions
        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for

--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -2,18 +2,17 @@ import warnings
 from typing import Container, Union

 import sympy as sp
+from sympy.logic.boolalg import BooleanFunction

 import pystencils.astnodes as ast
 from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
 from pystencils.data_types import (
-    PointerType, TypedSymbol, VectorType, cast_func, collate_types, get_type_of_expression,
-    vector_memory_access)
+    PointerType, TypedSymbol, VectorType, cast_func, collate_types, get_type_of_expression, vector_memory_access)
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
 from pystencils.field import Field
 from pystencils.integer_functions import modulo_ceil, modulo_floor
 from pystencils.sympyextensions import fast_subs
-from pystencils.transformations import (
-    cut_loop, filtered_tree_iteration, replace_inner_stride_with_one)
+from pystencils.transformations import cut_loop, filtered_tree_iteration, replace_inner_stride_with_one


 # noinspection PyPep8Naming
@@ -177,7 +176,7 @@ def insert_vector_casts(ast_node):
                                        visit_expr(expr.args[4]))
        elif isinstance(expr, cast_func):
            return expr
-        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, sp.boolalg.BooleanFunction):
+        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
            new_args = [visit_expr(a) for a in expr.args]
            arg_types = [get_type_of_expression(a) for a in new_args]
            if not any(type(t) is VectorType for t in arg_types):

--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
@@ -4,14 +4,14 @@ from functools import partial
 from typing import Tuple

 import numpy as np
-
-import pystencils
 import sympy as sp
 import sympy.codegen.ast
+from sympy.core.cache import cacheit
+from sympy.logic.boolalg import Boolean, BooleanFunction
+
+import pystencils
 from pystencils.cache import memorycache, memorycache_if_hashable
 from pystencils.utils import all_equal
-from sympy.core.cache import cacheit
-from sympy.logic.boolalg import Boolean

 try:
    import llvmlite.ir as ir
@@ -541,14 +541,20 @@ def get_type_of_expression(expr,
    elif isinstance(expr, sp.Indexed):
        typed_symbol = expr.base.label
        return typed_symbol.dtype.base_type
-    elif isinstance(expr, (sp.boolalg.Boolean, sp.boolalg.BooleanFunction)):
+    elif isinstance(expr, (Boolean, BooleanFunction)):
        # if any arg is of vector type return a vector boolean, else return a normal scalar boolean
        result = create_type("bool")
        vec_args = [get_type(a) for a in expr.args if isinstance(get_type(a), VectorType)]
        if vec_args:
            result = VectorType(result, width=vec_args[0].width)
        return result
-    elif isinstance(expr, (sp.Pow, sp.Sum, sp.Product)):
+    elif isinstance(expr, sp.Pow):
+        base_type = get_type(expr.args[0])
+        if expr.exp.is_integer:
+            return base_type
+        else:
+            return collate_types([create_type(default_float_type), base_type])
+    elif isinstance(expr, (sp.Sum, sp.Product)):
        return get_type(expr.args[0])
    elif isinstance(expr, sp.Expr):
        expr: sp.Expr
@@ -821,3 +827,6 @@ class TypedImaginaryUnit(TypedSymbol):

    __xnew__ = staticmethod(__new_stage2__)
    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
+
+    def __getnewargs__(self):
+        return (self.dtype,)
--- a/pystencils/datahandling/parallel_datahandling.py
+++ b/pystencils/datahandling/parallel_datahandling.py
@@ -109,11 +109,14 @@ class ParallelDataHandling(DataHandling):
        if hasattr(values_per_cell, '__len__'):
            raise NotImplementedError("Parallel data handling does not support multiple index dimensions")

-        self._fieldInformation[name] = {'ghost_layers': ghost_layers,
-                                        'values_per_cell': values_per_cell,
-                                        'layout': layout,
-                                        'dtype': dtype,
-                                        'alignment': alignment}
+        self._fieldInformation[name] = {
+            'ghost_layers': ghost_layers,
+            'values_per_cell': values_per_cell,
+            'layout': layout,
+            'dtype': dtype,
+            'alignment': alignment,
+            'field_type': field_type,
+        }

        layout_map = {'fzyx': wlb.field.Layout.fzyx, 'zyxf': wlb.field.Layout.zyxf,
                      'f': wlb.field.Layout.fzyx,

--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -100,6 +100,7 @@ class SerialDataHandling(DataHandling):
            'layout': layout,
            'dtype': dtype,
            'alignment': alignment,
+            'field_type': field_type,
        }

        index_dimensions = len(values_per_cell)

--- a/pystencils/fd/derivation.py
+++ b/pystencils/fd/derivation.py
 import warnings
 from collections import defaultdict
+import itertools

 import numpy as np
 import sympy as sp

 from pystencils.field import Field
+from pystencils.stencil import direction_string_to_offset
 from pystencils.sympyextensions import multidimensional_sum, prod
 from pystencils.utils import LinearEquationSystem, fully_contains

@@ -228,3 +230,120 @@ class FiniteDifferenceStencilDerivation:
        def __repr__(self):
            return "Finite difference stencil of accuracy {}, isotropic error: {}".format(self.accuracy,
                                                                                          self.is_isotropic)
+
+
+class FiniteDifferenceStaggeredStencilDerivation:
+    """Derives a finite difference stencil for application at a staggered position
+
+    Args:
+        neighbor: the neighbor direction string or vector at whose staggered position to calculate the derivative
+        dim: how many dimensions (2 or 3)
+        derivative: a tuple of directions over which to perform derivatives
+    """
+
+    def __init__(self, neighbor, dim, derivative=tuple()):
+        if type(neighbor) is str:
+            neighbor = direction_string_to_offset(neighbor)
+        if dim == 2:
+            assert neighbor[dim:] == 0
+        assert derivative is tuple() or max(derivative) < dim
+        neighbor = sp.Matrix(neighbor[:dim])
+        pos = neighbor / 2
+
+        def unitvec(i):
+            """return the `i`-th unit vector in three dimensions"""
+            a = np.zeros(dim, dtype=int)
+            a[i] = 1
+            return a
+
+        def flipped(a, i):
+            """return `a` with its `i`-th element's sign flipped"""
+            a = a.copy()
+            a[i] *= -1
+            return a
+
+        # determine the points to use, coordinates are relative to position
+        points = []
+        if np.linalg.norm(neighbor, 1) == 1:
+            main_points = [neighbor / 2, neighbor / -2]
+        elif np.linalg.norm(neighbor, 1) == 2:
+            nonzero_indices = [i for i, v in enumerate(neighbor) if v != 0 and i < dim]
+            main_points = [neighbor / 2, neighbor / -2, flipped(neighbor / 2, nonzero_indices[0]),
+                           flipped(neighbor / -2, nonzero_indices[0])]
+        else:
+            main_points = [neighbor.multiply_elementwise(sp.Matrix(c) / 2)
+                           for c in itertools.product([-1, 1], repeat=3)]
+        points += main_points
+        zero_indices = [i for i, v in enumerate(neighbor) if v == 0 and i < dim]
+        for i in zero_indices:
+            points += [point + sp.Matrix(unitvec(i)) for point in main_points]
+            points += [point - sp.Matrix(unitvec(i)) for point in main_points]
+        points_tuple = tuple([tuple(p) for p in points])
+        self._stencil = points_tuple
+
+        # determine the stencil weights
+        if len(derivative) == 0:
+            weights = None
+        else:
+            derivation = FiniteDifferenceStencilDerivation(derivative, points_tuple).get_stencil()
+            if not derivation.accuracy:
+                raise Exception('the requested derivative cannot be performed with the available neighbors')
+            weights = derivation.weights
+
+            # if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
+            free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
+            if len(free_weights) > 0:
+                zero_counts = defaultdict(list)
+                for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
+                                                repeat=len(free_weights)):
+                    subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
+                    weights = [w.subs(subs) for w in derivation.weights]
+                    if not all(a == 0 for a in weights):
+                        zero_count = sum([1 for w in weights if w == 0])
+                        zero_counts[zero_count].append(weights)
+                best = zero_counts[max(zero_counts.keys())]
+                if len(best) > 1:  # if there are multiple, pick the one that contains a nonzero center weight
+                    center = [tuple(p + pos) for p in points].index((0, 0, 0))
+                    best = [b for b in best if b[center] != 0]
+                if len(best) > 1:
+                    raise NotImplementedError("more than one suitable set of weights found, don't know how to proceed")
+                weights = best[0]
+                assert weights
+
+        points_tuple = tuple([tuple(p + pos) for p in points])
+        self._points = points_tuple
+        self._weights = weights
+
+    @property
+    def points(self):
+        """return the points of the stencil"""
+        return self._points
+
+    @property
+    def stencil(self):
+        """return the points of the stencil relative to the staggered position specified by neighbor"""
+        return self._stencil
+
+    @property
+    def weights(self):
+        """return the weights of the stencil"""
+        assert self._weights is not None
+        return self._weights
+
+    def visualize(self):
+        if self._weights is None:
+            ws = None
+        else:
+            ws = np.array([w for w in self.weights if w != 0], dtype=float)
+        pts = np.array([p for i, p in enumerate(self.points) if self.weights[i] != 0], dtype=int)
+        from pystencils.stencil import plot
+        plot(pts, data=ws)
+
+    def apply(self, field):
+        if field.index_dimensions == 0:
+            return sum([field.__getitem__(point) * weight for point, weight in zip(self.points, self.weights)])
+        else:
+            total = field.neighbor_vector(self.points[0]) * self.weights[0]
+            for point, weight in zip(self.points[1:], self.weights[1:]):
+                total += field.neighbor_vector(point) * weight
+            return total
--- a/pystencils/field.py
+++ b/pystencils/field.py
@@ -15,7 +15,7 @@ import pystencils
 from pystencils.alignedarray import aligned_empty
 from pystencils.data_types import StructType, TypedSymbol, create_type
 from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol
-from pystencils.stencil import direction_string_to_offset, offset_to_direction_string
+from pystencils.stencil import direction_string_to_offset, offset_to_direction_string, inverse_direction
 from pystencils.sympyextensions import is_integer_sequence

 __all__ = ['Field', 'fields', 'FieldType', 'AbstractField']
@@ -34,6 +34,8 @@ class FieldType(Enum):
    CUSTOM = 3
    # staggered field
    STAGGERED = 4
+    # staggered field that reverses sign when accessed via opposite direction
+    STAGGERED_FLUX = 5

    @staticmethod
    def is_generic(field):
@@ -58,7 +60,12 @@ class FieldType(Enum):
    @staticmethod
    def is_staggered(field):
        assert isinstance(field, Field)
-        return field.field_type == FieldType.STAGGERED
+        return field.field_type == FieldType.STAGGERED or field.field_type == FieldType.STAGGERED_FLUX
+
+    @staticmethod
+    def is_staggered_flux(field):
+        assert isinstance(field, Field)
+        return field.field_type == FieldType.STAGGERED_FLUX


 def fields(description=None, index_dimensions=0, layout=None, field_type=FieldType.GENERIC, **kwargs):
@@ -83,7 +90,7 @@ def fields(description=None, index_dimensions=0, layout=None, field_type=FieldTy

        Format string can be left out, field names are taken from keyword arguments.
            >>> fields(f1=arr_s, f2=arr_s)
-            [f1, f2]
+            [f1: double[20,20], f2: double[20,20]]

        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
            >>> f = fields(f=arr_v, index_dimensions=1)
@@ -392,7 +399,19 @@ class Field(AbstractField):
        return self.dtype.numpy_dtype.itemsize

    def __repr__(self):
-        return self._field_name
+        if any(isinstance(s, sp.Symbol) for s in self.spatial_shape):
+            spatial_shape_str = f'{self.spatial_dimensions}d'
+        else:
+            spatial_shape_str = ','.join(str(i) for i in self.spatial_shape)
+        index_shape_str = ','.join(str(i) for i in self.index_shape)
+
+        if self.index_shape:
+            return f'{self._field_name}({index_shape_str}): {self.dtype}[{spatial_shape_str}]'
+        else:
+            return f'{self._field_name}: {self.dtype}[{spatial_shape_str}]'
+
+    def __str__(self):
+        return self.name

    def neighbor(self, coord_id, offset):
        offset_list = [0] * self.spatial_dimensions
@@ -407,19 +426,37 @@ class Field(AbstractField):
        index_shape = self.index_shape
        if len(index_shape) == 0:
            return sp.Matrix([self.center])
-        if len(index_shape) == 1:
+        elif len(index_shape) == 1:
            return sp.Matrix([self(i) for i in range(index_shape[0])])
        elif len(index_shape) == 2:
-            def cb(*args):
-                r = self.__call__(*args)
-                return r
-            return sp.Matrix(*index_shape, cb)
+            return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
+        elif len(index_shape) == 3:
+            return sp.Matrix([[[self(i, j, k) for k in range(index_shape[2])]
+                             for j in range(index_shape[1])] for i in range(index_shape[0])])
+        else:
+            raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")

    @property
    def center(self):
        center = tuple([0] * self.spatial_dimensions)
        return Field.Access(self, center)

+    def neighbor_vector(self, offset):
+        """Like neighbor, but returns the entire vector/tensor stored at offset."""
+        if self.spatial_dimensions == 2 and len(offset) == 3:
+            assert offset[2] == 0
+            offset = offset[:2]
+
+        if self.index_dimensions == 0:
+            return sp.Matrix([self.__getitem__(offset)])
+        elif self.index_dimensions == 1:
+            return sp.Matrix([self.__getitem__(offset)(i) for i in range(self.index_shape[0])])
+        elif self.index_dimensions == 2:
+            return sp.Matrix([[self.__getitem__(offset)(i, k) for k in range(self.index_shape[1])]
+                             for i in range(self.index_shape[0])])
+        else:
+            raise NotImplementedError("neighbor_vector is not implemented for more than 2 index dimensions")
+
    def __getitem__(self, offset):
        if type(offset) is np.ndarray:
            offset = tuple(offset)
@@ -466,6 +503,7 @@ class Field(AbstractField):
        """
        assert FieldType.is_staggered(self)

+        offset_orig = offset
        if type(offset) is np.ndarray:
            offset = tuple(offset)
        if type(offset) is str:
@@ -477,20 +515,29 @@ class Field(AbstractField):
            raise ValueError("Wrong number of spatial indices: "
                             "Got %d, expected %d" % (len(offset), self.spatial_dimensions))

-        offset = list(offset)
-        neighbor = [0] * len(offset)
-        for i, o in enumerate(offset):
-            if (o + sp.Rational(1, 2)).is_Integer:
-                offset[i] += sp.Rational(1, 2)
-                neighbor[i] = 1
-        neighbor = offset_to_direction_string(neighbor)
+        prefactor = 1
+        neighbor_vec = [0] * len(offset)
+        for i in range(self.spatial_dimensions):
+            if (offset[i] + sp.Rational(1, 2)).is_Integer:
+                neighbor_vec[i] = sp.sign(offset[i])
+        neighbor = offset_to_direction_string(neighbor_vec)
+        if neighbor not in self.staggered_stencil:
+            neighbor_vec = inverse_direction(neighbor_vec)
+            neighbor = offset_to_direction_string(neighbor_vec)
+            if FieldType.is_staggered_flux(self):
+                prefactor = -1
+        if neighbor not in self.staggered_stencil:
+            raise ValueError("{} is not a valid neighbor for the {} stencil".format(offset_orig,
+                             self.staggered_stencil_name))
+
+        offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))
+
        idx = self.staggered_stencil.index(neighbor)
-        offset = tuple(offset)

        if self.index_dimensions == 1:  # this field stores a scalar value at each staggered position
            if index is not None:
                raise ValueError("Cannot specify an index for a scalar staggered field")
-            return Field.Access(self, offset, (idx,))
+            return prefactor * Field.Access(self, offset, (idx,))
        else:  # this field stores a vector or tensor at each staggered position
            if index is None:
                raise ValueError("Wrong number of indices: "
@@ -503,34 +550,58 @@ class Field(AbstractField):
                raise ValueError("Wrong number of indices: "
                                 "Got %d, expected %d" % (len(index), self.index_dimensions - 1))

-            return Field.Access(self, offset, (idx, *index))
+            return prefactor * Field.Access(self, offset, (idx, *index))
+
+    def staggered_vector_access(self, offset):
+        """Like staggered_access, but returns the entire vector/tensor stored at offset."""
+        assert FieldType.is_staggered(self)
+
+        if self.index_dimensions == 1:
+            return sp.Matrix([self.staggered_access(offset)])
+        elif self.index_dimensions == 2:
+            return sp.Matrix([self.staggered_access(offset, i) for i in range(self.index_shape[1])])
+        elif self.index_dimensions == 3:
+            return sp.Matrix([[self.staggered_access(offset, (i, k)) for k in range(self.index_shape[2])]
+                             for i in range(self.index_shape[1])])
+        else:
+            raise NotImplementedError("staggered_vector_access is not implemented for more than 3 index dimensions")

    @property
    def staggered_stencil(self):
        assert FieldType.is_staggered(self)
        stencils = {
            2: {
-                2: ["E", "N"],  # D2Q5
-                4: ["E", "N", "NE", "SE"]  # D2Q9
+                2: ["W", "S"],  # D2Q5
+                4: ["W", "S", "SW", "NW"]  # D2Q9
            },
            3: {
-                3: ["E", "N", "T"],  # D3Q7
-                7: ["E", "N", "T", "TNE", "BNE", "TSE", "BSE "],  # D3Q15
-                9: ["E", "N", "T", "NE", "SE", "TE", "BE", "TN", "BN"],  # D3Q19
-                13: ["E", "N", "T", "NE", "SE", "TE", "BE", "TN", "BN", "TNE", "BNE", "TSE", "BSE"]  # D3Q27
+                3: ["W", "S", "B"],  # D3Q7
+                7: ["W", "S", "B", "BSW", "TSW", "BNW", "TNW"],  # D3Q15
+                9: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS"],  # D3Q19
+                13: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS", "BSW", "TSW", "BNW", "TNW"]  # D3Q27
            }
        }
        if not self.index_shape[0] in stencils[self.spatial_dimensions]:
            raise ValueError("No known stencil has {} staggered points".format(self.index_shape[0]))
        return stencils[self.spatial_dimensions][self.index_shape[0]]

+    @property
+    def staggered_stencil_name(self):
+        assert FieldType.is_staggered(self)
+        return "D%dQ%d" % (self.spatial_dimensions, self.index_shape[0] * 2 + 1)
+
    def __call__(self, *args, **kwargs):
        center = tuple([0] * self.spatial_dimensions)
        return Field.Access(self, center)(*args, **kwargs)

    def hashable_contents(self):
-        dth = hash(self._dtype)
-        return self._layout, self.shape, self.strides, dth, self.field_type, self._field_name, self.latex_name
+        return (self._layout,
+                self.shape,
+                self.strides,
+                self.field_type,
+                self._field_name,
+                self.latex_name,
+                self._dtype)

    def __hash__(self):
        return hash(self.hashable_contents())
@@ -774,7 +845,7 @@ class Field(AbstractField):
            assert FieldType.is_staggered(self._field)
            neighbor = self._field.staggered_stencil[index]
            neighbor = direction_string_to_offset(neighbor, self._field.spatial_dimensions)
-            return [(o - sp.Rational(neighbor[i], 2)) for i, o in enumerate(offsets)]
+            return [(o - sp.Rational(int(neighbor[i]), 2)) for i, o in enumerate(offsets)]

        def _latex(self, _):
            n = self._field.latex_name if self._field.latex_name else self._field.name

--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
-import itertools
 from types import MappingProxyType
+from itertools import combinations

 import sympy as sp

 from pystencils.assignment import Assignment
 from pystencils.astnodes import Block, Conditional, LoopOverCoordinate, SympyAssignment
 from pystencils.cpu.vectorization import vectorize
+from pystencils.field import Field, FieldType
 from pystencils.gpucuda.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
+from pystencils.stencil import direction_string_to_offset, inverse_direction_string
 from pystencils.transformations import (
    loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)

@@ -23,7 +25,9 @@ def create_kernel(assignments,
                  cpu_blocking=None,
                  gpu_indexing='block',
                  gpu_indexing_params=MappingProxyType({}),
-                  use_textures_for_interpolation=True):
+                  use_textures_for_interpolation=True,
+                  cpu_prepend_optimizations=[],
+                  use_auto_for_assignments=False):
    """
    Creates abstract syntax tree (AST) of kernel, using a list of update equations.

@@ -34,9 +38,9 @@ def create_kernel(assignments,
                  to type
        iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
                         part of the field is iterated over
-        ghost_layers: if left to default, the number of necessary ghost layers is determined automatically
-                     a single integer specifies the ghost layer count at all borders, can also be a sequence of
-                     pairs ``[(x_lower_gl, x_upper_gl), .... ]``
+        ghost_layers: a single integer specifies the ghost layer count at all borders, can also be a sequence of
+                      pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
+                      If left to default, the number of ghost layers is determined automatically.
        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
                                 periodicity kernel, that access the field outside the iteration bounds. Use with care!
        cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
@@ -47,6 +51,7 @@ def create_kernel(assignments,
        gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
        gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
                             e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
+        cpu_prepend_optimizations: list of extra optimizations to perform first on the AST

    Returns:
        abstract syntax tree (AST) object, that can either be printed as source code with `show_code` or
@@ -84,6 +89,8 @@ def create_kernel(assignments,
        ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                            iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                            skip_independence_check=skip_independence_check)
+        for optimization in cpu_prepend_optimizations:
+            optimization(ast)
        omp_collapse = None
        if cpu_blocking:
            omp_collapse = loop_blocking(ast, cpu_blocking)
@@ -96,12 +103,10 @@ def create_kernel(assignments,
                vectorize(ast, **cpu_vectorize_info)
            else:
                raise ValueError("Invalid value for cpu_vectorize_info")
-        return ast
    elif target == 'llvm':
        from pystencils.llvm import create_kernel
        ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                            iteration_slice=iteration_slice, ghost_layers=ghost_layers)
-        return ast
    elif target == 'gpu':
        from pystencils.gpucuda import create_cuda_kernel
        ast = create_cuda_kernel(assignments, type_info=data_type,
@@ -109,10 +114,15 @@ def create_kernel(assignments,
                                 iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                                 skip_independence_check=skip_independence_check,
                                 use_textures_for_interpolation=use_textures_for_interpolation)
-        return ast
    else:
        raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))

+    if use_auto_for_assignments:
+        for a in ast.atoms(SympyAssignment):
+            a.use_auto = True
+
+    return ast
+

 def create_indexed_kernel(assignments,
                          index_fields,
@@ -186,104 +196,132 @@ def create_indexed_kernel(assignments,
        raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))


-def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu',
-                            gpu_exclusive_conditions=False, **kwargs):
+def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions=False, **kwargs):
    """Kernel that updates a staggered field.

    .. image:: /img/staggered_grid.svg

+    For a staggered field, the first index coordinate defines the location of the staggered value.
+    Further index coordinates can be used to store vectors/tensors at each point.
+
    Args:
-        staggered_field: field where the first index coordinate defines the location of the staggered value
-                can have 1 or 2 index coordinates, in case of two index coordinates at every staggered location
-                a vector is stored, expressions parameter has to be a sequence of sequences then
-                where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right cell
-                boundary and ``f[0,0](1)`` the southern cell boundary etc.
-        expressions: sequence of expressions of length dim, defining how the west, southern, (bottom) cell boundary
-                     should be updated.
-        subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
-        target: 'cpu' or 'gpu'
-        gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths
-        kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed
+        assignments: a sequence of assignments or an AssignmentCollection.
+                     Assignments to staggered field are processed specially, while subexpressions and assignments to
+                     regular fields are passed through to `create_kernel`. Multiple different staggered fields can be
+                     used, but they all need to use the same stencil (i.e. the same number of staggered points) and
+                     shape.
+        target: 'cpu', 'llvm' or 'gpu'
+        gpu_exclusive_conditions: disable the use of multiple conditionals inside the loop. The outer layers are then
+                                  handled in an else branch.
+        kwargs: passed directly to create_kernel, iteration_slice and ghost_layers parameters are not allowed

    Returns:
        AST, see `create_kernel`
    """
    assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
-    assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
+
+    if isinstance(assignments, AssignmentCollection):
+        subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments
+                                                       if not hasattr(a, 'lhs')
+                                                       or type(a.lhs) is not Field.Access
+                                                       or not FieldType.is_staggered(a.lhs.field)]
+        assignments = [a for a in assignments.main_assignments if hasattr(a, 'lhs')
+                       and type(a.lhs) is Field.Access
+                       and FieldType.is_staggered(a.lhs.field)]
+    else:
+        subexpressions = [a for a in assignments if not hasattr(a, 'lhs')
+                          or type(a.lhs) is not Field.Access
+                          or not FieldType.is_staggered(a.lhs.field)]
+        assignments = [a for a in assignments if hasattr(a, 'lhs')
+                       and type(a.lhs) is Field.Access
+                       and FieldType.is_staggered(a.lhs.field)]
+    if len(set([tuple(a.lhs.field.staggered_stencil) for a in assignments])) != 1:
+        raise ValueError("All assignments need to be made to staggered fields with the same stencil")
+    if len(set([a.lhs.field.shape for a in assignments])) != 1:
+        raise ValueError("All assignments need to be made to staggered fields with the same shape")
+
+    staggered_field = assignments[0].lhs.field
+    stencil = staggered_field.staggered_stencil
    dim = staggered_field.spatial_dimensions
+    shape = staggered_field.shape

    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
-    conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
-    assert len(expressions) == dim
-    if staggered_field.index_dimensions == 2:
-        assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
-            "If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
-            "same length."

    final_assignments = []
-    last_conditional = None
-
-    def add(condition, dimensions, as_else_block=False):
-        nonlocal last_conditional
-        if staggered_field.index_dimensions == 1:
-            assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions]
-            a_coll = AssignmentCollection(assignments, list(subexpressions))
-            a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions])
-        elif staggered_field.index_dimensions == 2:
-            assert staggered_field.has_fixed_index_shape
-            assignments = [Assignment(staggered_field(d, i), expr)
-                           for d in dimensions
-                           for i, expr in enumerate(expressions[d])]
-            a_coll = AssignmentCollection(assignments, list(subexpressions))
-            a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])
-                                          for d in dimensions])
-        sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
-        if as_else_block and last_conditional:
-            new_cond = Conditional(condition, Block(sp_assignments))
-            last_conditional.false_block = Block([new_cond])
-            last_conditional = new_cond
-        else:
-            last_conditional = Conditional(condition, Block(sp_assignments))
-            final_assignments.append(last_conditional)

-    if target == 'cpu' or not gpu_exclusive_conditions:
-        for d in range(dim):
-            cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
-            add(cond, [d])
-    elif target == 'gpu':
-        full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)]
-        for include in itertools.product(*[[1, 0]] * dim):
-            case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)])
-            dimensions_to_include = [i for i in range(dim) if include[i]]
-            if dimensions_to_include:
-                add(case_conditions, dimensions_to_include, True)
+    # find out whether any of the ghost layers is not needed
+    common_exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
+    for direction in stencil:
+        exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
+        for elementary_direction in direction:
+            exclusions.remove(inverse_direction_string(elementary_direction))
+        common_exclusions.intersection_update(exclusions)
+    ghost_layers = [[0, 0] for d in range(dim)]
+    for direction in common_exclusions:
+        direction = direction_string_to_offset(direction)
+        for d, s in enumerate(direction):
+            if s == 1:
+                ghost_layers[d][1] = 1
+            elif s == -1:
+                ghost_layers[d][0] = 1

-    ghost_layers = [(1, 0)] * dim
+    def condition(direction):
+        """exclude those staggered points that correspond to fluxes between ghost cells"""
+        exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])

-    blocking = kwargs.get('cpu_blocking', None)
-    if blocking:
-        del kwargs['cpu_blocking']
+        for elementary_direction in direction:
+            exclusions.remove(inverse_direction_string(elementary_direction))
+        conditions = []
+        for e in exclusions:
+            if e in common_exclusions:
+                continue
+            offset = direction_string_to_offset(e)
+            for i, o in enumerate(offset):
+                if o == 1:
+                    conditions.append(counters[i] < shape[i] - 1)
+                elif o == -1:
+                    conditions.append(counters[i] > 0)
+        return sp.And(*conditions)

-    cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
-    if cpu_vectorize_info:
-        del kwargs['cpu_vectorize_info']
-    openmp = kwargs.get('cpu_openmp', None)
-    if openmp:
-        del kwargs['cpu_openmp']
+    if gpu_exclusive_conditions:
+        outer_assignment = None
+        conditions = {direction: condition(direction) for direction in stencil}
+        for num_conditions in range(len(stencil)):
+            for combination in combinations(conditions.values(), num_conditions):
+                for assignment in assignments:
+                    direction = stencil[assignment.lhs.index[0]]
+                    if conditions[direction] in combination:
+                        assignment = SympyAssignment(assignment.lhs, assignment.rhs)
+                        outer_assignment = Conditional(sp.And(*combination), Block([assignment]), outer_assignment)

-    ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
+        inner_assignment = []
+        for assignment in assignments:
+            direction = stencil[assignment.lhs.index[0]]
+            inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs))
+        last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]),
+                                       Block(inner_assignment), outer_assignment)
+        final_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
+                            [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
+                            [last_conditional]

-    if target == 'cpu':
-        remove_conditionals_in_staggered_kernel(ast)
-        move_constants_before_loop(ast)
-        omp_collapse = None
-        if blocking:
-            omp_collapse = loop_blocking(ast, blocking)
-        if openmp:
-            from pystencils.cpu import add_openmp
-            add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
-        if cpu_vectorize_info is True:
-            vectorize(ast)
-        elif isinstance(cpu_vectorize_info, dict):
-            vectorize(ast, **cpu_vectorize_info)
+        if target == 'cpu':
+            from pystencils.cpu import create_kernel as create_kernel_cpu
+            ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, **kwargs)
+        else:
+            ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
+        return ast
+
+    for assignment in assignments:
+        direction = stencil[assignment.lhs.index[0]]
+        sp_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
+                         [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
+                         [SympyAssignment(assignment.lhs, assignment.rhs)]
+        last_conditional = Conditional(condition(direction), Block(sp_assignments))
+        final_assignments.append(last_conditional)
+
+    remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers])
+    prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional),
+                             move_constants_before_loop]
+    ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target,
+                        cpu_prepend_optimizations=prepend_optimizations, **kwargs)
    return ast
--- a/pystencils/stencil.py
+++ b/pystencils/stencil.py
@@ -16,6 +16,11 @@ def inverse_direction(direction):
    return tuple([-i for i in direction])


+def inverse_direction_string(direction):
+    """Returns inverse of given direction string"""
+    return offset_to_direction_string(inverse_direction(direction_string_to_offset(direction)))
+
+
 def is_valid(stencil, max_neighborhood=None):
    """
    Tests if a nested sequence is a valid stencil i.e. all the inner sequences have the same length.

--- a/pystencils/transformations.py
+++ b/pystencils/transformations.py
@@ -8,14 +8,14 @@ from types import MappingProxyType
 import numpy as np
 import sympy as sp
 from sympy.core.numbers import ImaginaryUnit
-from sympy.logic.boolalg import Boolean
+from sympy.logic.boolalg import Boolean, BooleanFunction

 import pystencils.astnodes as ast
 import pystencils.integer_functions
 from pystencils.assignment import Assignment
 from pystencils.data_types import (
-    PointerType, StructType, TypedImaginaryUnit, TypedSymbol, cast_func, collate_types, create_type,
-    get_base_type, get_type_of_expression, pointer_arithmetic_func, reinterpret_cast_func)
+    PointerType, StructType, TypedImaginaryUnit, TypedSymbol, cast_func, collate_types, create_type, get_base_type,
+    get_type_of_expression, pointer_arithmetic_func, reinterpret_cast_func)
 from pystencils.field import AbstractField, Field, FieldType
 from pystencils.kernelparameters import FieldPointerSymbol
 from pystencils.simp.assignment_collection import AssignmentCollection
@@ -630,13 +630,18 @@ def move_constants_before_loop(ast_node):
                    else:
                        target.insert_before(child, child_to_insert_before)
                elif exists_already and exists_already.rhs == child.rhs:
-                    pass
+                    if target.args.index(exists_already) > target.args.index(child_to_insert_before):
+                        assert target.args.count(exists_already) == 1
+                        assert target.args.count(child_to_insert_before) == 1
+                        target.args.remove(exists_already)
+                        target.insert_before(exists_already, child_to_insert_before)
                else:
                    # this variable already exists in outer block, but with different rhs
                    # -> symbol has to be renamed
                    assert isinstance(child.lhs, TypedSymbol)
                    new_symbol = TypedSymbol(sp.Dummy().name, child.lhs.dtype)
-                    target.insert_before(ast.SympyAssignment(new_symbol, child.rhs), child_to_insert_before)
+                    target.insert_before(ast.SympyAssignment(new_symbol, child.rhs, is_const=child.is_const),
+                                         child_to_insert_before)
                    substitute_variables[child.lhs] = new_symbol


@@ -846,7 +851,7 @@ class KernelConstraintsCheck:
            return cast_func(
                self.process_expression(rhs.args[0], type_constants=False),
                rhs.dtype)
-        elif isinstance(rhs, sp.boolalg.BooleanFunction) or \
+        elif isinstance(rhs, BooleanFunction) or \
                type(rhs) in pystencils.integer_functions.__dict__.values():
            new_args = [self.process_expression(a, type_constants) for a in rhs.args]
            types_of_expressions = [get_type_of_expression(a) for a in new_args]
@@ -1025,7 +1030,7 @@ def insert_casts(node):
        types = [get_type_of_expression(arg) for arg in args]
        assert len(types) > 0
        # Never ever, ever collate to float type for boolean functions!
-        target = collate_types(types, forbid_collation_to_float=isinstance(node.func, sp.boolalg.BooleanFunction))
+        target = collate_types(types, forbid_collation_to_float=isinstance(node.func, BooleanFunction))
        zipped = list(zip(args, types))
        if target.func is PointerType:
            assert node.func is sp.Add
@@ -1064,15 +1069,19 @@ def insert_casts(node):
    return node.func(*args)


-def remove_conditionals_in_staggered_kernel(function_node: ast.KernelFunction) -> None:
-    """Removes conditionals of a kernel that iterates over staggered positions by splitting the loops at last element"""
+def remove_conditionals_in_staggered_kernel(function_node: ast.KernelFunction, include_first=True) -> None:
+    """Removes conditionals of a kernel that iterates over staggered positions by splitting the loops at last or
+       first and last element"""

    all_inner_loops = [l for l in function_node.atoms(ast.LoopOverCoordinate) if l.is_innermost_loop]
    assert len(all_inner_loops) == 1, "Transformation works only on kernels with exactly one inner loop"
    inner_loop = all_inner_loops.pop()

    for loop in parents_of_type(inner_loop, ast.LoopOverCoordinate, include_current=True):
-        cut_loop(loop, [loop.stop - 1])
+        if include_first:
+            cut_loop(loop, [loop.start + 1, loop.stop - 1])
+        else:
+            cut_loop(loop, [loop.stop - 1])

    simplify_conditionals(function_node.body, loop_counter_simplification=True)
    cleanup_blocks(function_node.body)

--- a/pystencils_tests/test_blocking_staggered.py
+++ b/pystencils_tests/test_blocking_staggered.py
@@ -11,8 +11,9 @@ def test_blocking_staggered():
       f[0, 0, 0] - f[0, -1, 0],
       f[0, 0, 0] - f[0, 0, -1],
    ]
-    kernel = ps.create_staggered_kernel(stag, terms, cpu_blocking=(3, 16, 8)).compile()
-    reference_kernel = ps.create_staggered_kernel(stag, terms).compile()
+    assignments = [ps.Assignment(stag.staggered_access(d), terms[i]) for i, d in enumerate(stag.staggered_stencil)]
+    kernel = ps.create_staggered_kernel(assignments, cpu_blocking=(3, 16, 8)).compile()
+    reference_kernel = ps.create_staggered_kernel(assignments).compile()
    print(ps.show_code(kernel.ast))

    f_arr = np.random.rand(80, 33, 19)

--- a/pystencils_tests/test_cuda_known_functions.py
+++ b/pystencils_tests/test_cuda_known_functions.py
@@ -32,8 +32,7 @@ def test_cuda_but_not_c():
    })

    ast = pystencils.create_kernel(assignments, 'cpu')
-    code = str(pystencils.show_code(ast))
-    assert "Not supported" in code
+    print(pystencils.show_code(ast))


 def test_cuda_unknown():
@@ -46,4 +45,3 @@ def test_cuda_unknown():
    ast = pystencils.create_kernel(assignments, 'gpu')
    code = str(pystencils.show_code(ast))
    print(code)
-    assert "Not supported in CUDA" in code
--- a/pystencils_tests/test_fd_derivation.ipynb
+++ b/pystencils_tests/test_fd_derivation.ipynb
--- a/pystencils_tests/test_field.py
+++ b/pystencils_tests/test_field.py
@@ -28,6 +28,9 @@ def test_field_basic():
    assert neighbor.offsets == (-1, 1)
    assert '_' in neighbor._latex('dummy')

+    f = Field.create_fixed_size('f', (8, 8, 2, 2, 2), index_dimensions=3)
+    assert f.center_vector == sp.Matrix([[[f(i, j, k) for k in range(2)] for j in range(2)] for i in range(2)])
+
    f = Field.create_generic('f', spatial_dimensions=5, index_dimensions=2)
    field_access = f[1, -1, 2, -3, 0](1, 0)
    assert field_access.offsets == (1, -1, 2, -3, 0)
@@ -135,17 +138,30 @@ def test_staggered():
    j1, j2, j3 = ps.fields('j1(2), j2(2,2), j3(2,2,2) : double[2D]', field_type=FieldType.STAGGERED)

    assert j1[0, 1](1) == j1.staggered_access((0, sp.Rational(1, 2)))
+    assert j1[1, 1](1) == j1.staggered_access((1, sp.Rational(1, 2)))
+    assert j1[0, 2](1) == j1.staggered_access((0, sp.Rational(3, 2)))
    assert j1[0, 1](1) == j1.staggered_access("N")
    assert j1[0, 0](1) == j1.staggered_access("S")
+    assert j1.staggered_vector_access("N") == sp.Matrix([j1.staggered_access("N")])

    assert j2[0, 1](1, 1) == j2.staggered_access((0, sp.Rational(1, 2)), 1)
    assert j2[0, 1](1, 1) == j2.staggered_access("N", 1)
+    assert j2.staggered_vector_access("N") == sp.Matrix([j2.staggered_access("N", 0), j2.staggered_access("N", 1)])

    assert j3[0, 1](1, 1, 1) == j3.staggered_access((0, sp.Rational(1, 2)), (1, 1))
    assert j3[0, 1](1, 1, 1) == j3.staggered_access("N", (1, 1))
+    assert j3.staggered_vector_access("N") == sp.Matrix([[j3.staggered_access("N", (i, j))
+                                                        for j in range(2)] for i in range(2)])

    # D2Q9
    k = ps.fields('k(4) : double[2D]', field_type=FieldType.STAGGERED)

    assert k[1, 1](2) == k.staggered_access("NE")
    assert k[0, 0](2) == k.staggered_access("SW")
+
+    assert k[0, 0](3) == k.staggered_access("NW")
+
+    # sign reversed when using as flux field
+    r = ps.fields('r(2) : double[2D]', field_type=FieldType.STAGGERED_FLUX)
+    assert r[0, 0](0) == r.staggered_access("W")
+    assert -r[1, 0](0) == r.staggered_access("E")
--- a/pystencils_tests/test_loop_cutting.py
+++ b/pystencils_tests/test_loop_cutting.py
@@ -55,7 +55,8 @@ def test_staggered_iteration():
        for d in range(dim):
            expressions.append(sum(f[o] for o in offsets_in_plane(d, 0, dim)) -
                               sum(f[o] for o in offsets_in_plane(d, -1, dim)))
-        func_optimized = create_staggered_kernel(s, expressions).compile()
+        assignments = [ps.Assignment(s.staggered_access(d), expressions[i]) for i, d in enumerate(s.staggered_stencil)]
+        func_optimized = create_staggered_kernel(assignments).compile()
        assert not func_optimized.ast.atoms(Conditional), "Loop cutting optimization did not work"

        func(f=f_arr, s=s_arr_ref)
@@ -111,8 +112,10 @@ def test_staggered_gpu():
    s = ps.fields("s({dim}): double[{dim}D]".format(dim=dim), field_type=FieldType.STAGGERED)
    expressions = [(f[0, 0] + f[-1, 0]) / 2,
                   (f[0, 0] + f[0, -1]) / 2]
-    kernel_ast = ps.create_staggered_kernel(s, expressions, target='gpu', gpu_exclusive_conditions=True)
+    assignments = [ps.Assignment(s.staggered_access(d), expressions[i]) for i, d in enumerate(s.staggered_stencil)]
+    kernel_ast = ps.create_staggered_kernel(assignments, target='gpu', gpu_exclusive_conditions=True)
    assert len(kernel_ast.atoms(Conditional)) == 4

-    kernel_ast = ps.create_staggered_kernel(s, expressions, target='gpu', gpu_exclusive_conditions=False)
+    assignments = [ps.Assignment(s.staggered_access(d), expressions[i]) for i, d in enumerate(s.staggered_stencil)]
+    kernel_ast = ps.create_staggered_kernel(assignments, target='gpu', gpu_exclusive_conditions=False)
    assert len(kernel_ast.atoms(Conditional)) == 3
--- a/pystencils_tests/test_staggered_kernel.py
+++ b/pystencils_tests/test_staggered_kernel.py
+import pystencils as ps
+import numpy as np
+import sympy as sp
+
+
+class TestStaggeredDiffusion:
+    def _run(self, num_neighbors):
+        L = (40, 40)
+        D = 0.066
+        dt = 1
+        T = 100
+
+        dh = ps.create_data_handling(L, periodicity=True, default_target='cpu')
+
+        c = dh.add_array('c', values_per_cell=1)
+        j = dh.add_array('j', values_per_cell=num_neighbors, field_type=ps.FieldType.STAGGERED_FLUX)
+
+        x_staggered = - c[-1, 0] + c[0, 0]
+        y_staggered = - c[0, -1] + c[0, 0]
+        xy_staggered = - c[-1, -1] + c[0, 0]
+        xY_staggered = - c[-1, 1] + c[0, 0]
+
+        jj = j.staggered_access
+        divergence = -1 * D / (1 + sp.sqrt(2) if j.index_shape[0] == 4 else 1) * \
+            sum([jj(d) / sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm() for d in j.staggered_stencil
+                + [ps.stencil.inverse_direction_string(d) for d in j.staggered_stencil]])
+
+        update = [ps.Assignment(c.center, c.center + dt * divergence)]
+        flux = [ps.Assignment(j.staggered_access("W"), x_staggered),
+                ps.Assignment(j.staggered_access("S"), y_staggered)]
+        if j.index_shape[0] == 4:
+            flux += [ps.Assignment(j.staggered_access("SW"), xy_staggered),
+                     ps.Assignment(j.staggered_access("NW"), xY_staggered)]
+
+        staggered_kernel = ps.create_staggered_kernel(flux, target=dh.default_target).compile()
+        div_kernel = ps.create_kernel(update, target=dh.default_target).compile()
+
+        def time_loop(steps):
+            sync = dh.synchronization_function([c.name])
+            dh.all_to_gpu()
+            for i in range(steps):
+                sync()
+                dh.run_kernel(staggered_kernel)
+                dh.run_kernel(div_kernel)
+            dh.all_to_cpu()
+
+        def init():
+            dh.fill(c.name, np.nan, ghost_layers=True, inner_ghost_layers=True)
+            dh.fill(c.name, 0)
+            dh.fill(j.name, np.nan, ghost_layers=True, inner_ghost_layers=True)
+            dh.cpu_arrays[c.name][L[0] // 2:L[0] // 2 + 2, L[1] // 2:L[1] // 2 + 2] = 1.0
+
+        init()
+        time_loop(T)
+
+        reference = np.empty(L)
+        for x in range(L[0]):
+            for y in range(L[1]):
+                r = np.array([x, y]) - L[0] / 2 + 0.5
+                reference[x, y] = (4 * np.pi * D * T)**(-dh.dim / 2) * np.exp(-np.dot(r, r) / (4 * D * T)) * (2**dh.dim)
+
+        assert np.abs(dh.gather_array(c.name) - reference).max() < 5e-4
+
+    def test_diffusion_2(self):
+        self._run(2)
+
+    def test_diffusion_4(self):
+        self._run(4)
+
+
+def test_staggered_subexpressions():
+    dh = ps.create_data_handling((10, 10), periodicity=True, default_target='cpu')
+    j = dh.add_array('j', values_per_cell=2, field_type=ps.FieldType.STAGGERED)
+    c = sp.symbols("c")
+    assignments = [ps.Assignment(j.staggered_access("W"), c),
+                   ps.Assignment(c, 1)]
+    ps.create_staggered_kernel(assignments, target=dh.default_target).compile()
+
+
+def test_staggered_loop_cutting():
+    dh = ps.create_data_handling((4, 4), periodicity=True, default_target='cpu')
+    j = dh.add_array('j', values_per_cell=4, field_type=ps.FieldType.STAGGERED)
+    assignments = [ps.Assignment(j.staggered_access("SW"), 1)]
+    ast = ps.create_staggered_kernel(assignments, target=dh.default_target)
+    assert not ast.atoms(ps.astnodes.Conditional)
--- a/pystencils_tests/test_types.py
+++ b/pystencils_tests/test_types.py
 import sympy as sp
-
+import numpy as np
+import pystencils as ps
 from pystencils import data_types
-from pystencils.data_types import *
-from pystencils.kernelparameters import FieldShapeSymbol
+from pystencils.data_types import TypedSymbol, get_type_of_expression, VectorType, collate_types, create_type


 def test_parsing():
@@ -25,7 +25,6 @@ def test_collation():


 def test_dtype_of_constants():
-
    # Some come constants are neither of type Integer,Float,Rational and don't have args
    # >>> isinstance(pi, Integer)
    # False
@@ -39,13 +38,25 @@ def test_dtype_of_constants():


 def test_assumptions():
-
-    x = pystencils.fields('x:  float32[3d]')
+    x = ps.fields('x:  float32[3d]')

    assert x.shape[0].is_nonnegative
    assert (2 * x.shape[0]).is_nonnegative
    assert (2 * x.shape[0]).is_integer
-    assert(TypedSymbol('a', create_type('uint64'))).is_nonnegative
+    assert (TypedSymbol('a', create_type('uint64'))).is_nonnegative
    assert (TypedSymbol('a', create_type('uint64'))).is_positive is None
    assert (TypedSymbol('a', create_type('uint64')) + 1).is_positive
    assert (x.shape[0] + 1).is_real
+
+
+def test_sqrt_of_integer():
+    """Regression test for bug where sqrt(3) was classified as integer"""
+    f = ps.fields("f: [1D]")
+    tmp = sp.symbols("tmp")
+
+    assignments = [ps.Assignment(tmp, sp.sqrt(3)),
+                   ps.Assignment(f[0], tmp)]
+    arr = np.array([1], dtype=np.float64)
+    kernel = ps.create_kernel(assignments).compile()
+    kernel(f=arr)
+    assert 1.7 < arr[0] < 1.8
No results found