Compare revisions

Stephan Seitz · Stephan Seitz · Stephan Seitz · Stephan Seitz · Stephan Seitz · Stephan Seitz
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "pystencils/gpucuda/CubicInterpolationCUDA"]
+	path = pystencils/gpucuda/CubicInterpolationCUDA
+	url = https://github.com/theHamsta/CubicInterpolationCUDA.git
--- a/pystencils/__init__.py
+++ b/pystencils/__init__.py
 """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
 from . import sympy_gmpy_bug_workaround  # NOQA
-from . import fd
 from . import stencil as stencil
-from .assignment import Assignment, assignment_from_stencil
-from .data_types import TypedSymbol
-from .datahandling import create_data_handling
-from .display_utils import show_code, to_dot
 from .field import Field, FieldType, fields
-from .kernel_decorator import kernel
-from .kernelcreation import create_indexed_kernel, create_kernel, create_staggered_kernel
-from .simp import AssignmentCollection
+from .data_types import TypedSymbol, matrix_symbols, typed_symbols
 from .slicing import make_slice
+from .kernelcreation import create_kernel, create_indexed_kernel, create_staggered_kernel
+from .display_utils import show_code, to_dot
+from .simp import AssignmentCollection
+from .assignment import Assignment, assignment_from_stencil
 from .sympyextensions import SymbolCreator
+from .datahandling import create_data_handling
+from .kernel_decorator import kernel
+from . import fd
+from .spatial_coordinates import (x_, x_staggered, x_staggered_vector, x_vector,
+                                  y_, y_staggered, z_, z_staggered)

 __all__ = ['Field', 'FieldType', 'fields',
           'TypedSymbol',
@@ -25,4 +27,8 @@ __all__ = ['Field', 'FieldType', 'fields',
           'create_data_handling',
           'kernel',
           'fd',
+           'x_', 'y_', 'z_',
+           'x_staggered', 'y_staggered', 'z_staggered',
+           'x_vector', 'x_staggered_vector',
+           'typed_symbols', 'matrix_symbols',
           'stencil']
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
-import uuid
+import collections
 from typing import Any, List, Optional, Sequence, Set, Union

 import sympy as sp
@@ -33,7 +33,7 @@ class Node:
        raise NotImplementedError()

    def subs(self, subs_dict) -> None:
-        """Inplace! substitute, similar to sympy's but modifies the AST inplace."""
+        """Inplace! Substitute, similar to sympy's but modifies the AST inplace."""
        for a in self.args:
            a.subs(subs_dict)

@@ -102,7 +102,8 @@ class Conditional(Node):
        result = self.true_block.undefined_symbols
        if self.false_block:
            result.update(self.false_block.undefined_symbols)
-        result.update(self.condition_expr.atoms(sp.Symbol))
+        if hasattr(self.condition_expr, 'atoms'):
+            result.update(self.condition_expr.atoms(sp.Symbol))
        return result

    def __str__(self):
@@ -227,7 +228,8 @@ class KernelFunction(Node):
            return ()

        argument_symbols = self._body.undefined_symbols - self.global_variables
-        parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
+        parameters = [self.Parameter(symbol, get_fields(symbol))
+                      for symbol in argument_symbols]
        if hasattr(self, 'indexing'):
            parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
        parameters.sort(key=lambda p: p.symbol.name)
@@ -279,8 +281,15 @@ class Block(Node):
            a.subs(subs_dict)

    def insert_front(self, node):
-        node.parent = self
-        self._nodes.insert(0, node)
+        if isinstance(node, collections.Iterable):
+            node = list(node)
+            for n in node:
+                n.parent = self
+
+            self._nodes = node + self._nodes
+        else:
+            node.parent = self
+            self._nodes.insert(0, node)

    def insert_before(self, new_node, insert_before):
        new_node.parent = self
@@ -481,7 +490,7 @@ class SympyAssignment(Node):
    def __init__(self, lhs_symbol, rhs_expr, is_const=True):
        super(SympyAssignment, self).__init__(parent=None)
        self._lhs_symbol = lhs_symbol
-        self.rhs = rhs_expr
+        self.rhs = sp.simplify(rhs_expr)
        self._is_const = is_const
        self._is_declaration = self.__is_declaration()

@@ -517,7 +526,7 @@ class SympyAssignment(Node):

    @property
    def undefined_symbols(self):
-        result = self.rhs.atoms(sp.Symbol)
+        result = {s for s in self.rhs.free_symbols if not isinstance(s, sp.Indexed)}
        # Add loop counters if there a field accesses
        loop_counters = set()
        for symbol in result:
@@ -718,5 +727,48 @@ class DestructuringBindingsForFieldClass(Node):
        return self.body.atoms(arg_type) | {s for s in self.symbols_defined if isinstance(s, arg_type)}


-def get_dummy_symbol(dtype='bool'):
-    return TypedSymbol('dummy%s' % uuid.uuid4().hex, create_type(dtype))
+class SourceCodeComment(Node):
+    def __init__(self, text):
+        self.text = text
+
+    @property
+    def args(self):
+        return []
+
+    @property
+    def symbols_defined(self):
+        return set()
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    def __str__(self):
+        return "/* " + self.text + " */"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class EmptyLine(Node):
+
+    def __init__(self):
+        pass
+
+    @property
+    def args(self):
+        return []
+
+    @property
+    def symbols_defined(self):
+        return set()
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    def __str__(self):
+        return ""
+
+    def __repr__(self):
+        return self.__str__()
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -128,6 +128,12 @@ class CustomCodeNode(Node):
    def undefined_symbols(self):
        return self._symbols_read - self._symbols_defined

+    def __eq___(self, other):
+        return self._code == other._code
+
+    def __hash__(self):
+        return hash(self._code)
+

 class PrintNode(CustomCodeNode):
    # noinspection SpellCheckingInspection
@@ -172,7 +178,8 @@ class CBackend:
        raise NotImplementedError(self.__class__ + " does not support node of type " + str(type(node)))

    def _print_KernelFunction(self, node):
-        function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()]
+        parameters = node.get_parameters()
+        function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in parameters]
        launch_bounds = ""
        if self.__class__ == 'cuda':
            max_threads = node.indexing.max_threads_per_block()
@@ -251,6 +258,12 @@ class CBackend:
    def _print_CustomCodeNode(self, node):
        return node.get_code(self._dialect, self._vector_instruction_set)

+    def _print_SourceCodeComment(self, node):
+        return "/* " + node.text + " */"
+
+    def _print_EmptyLine(self, node):
+        return ""
+
    def _print_Conditional(self, node):
        cond_type = get_type_of_expression(node.condition_expr)
        if isinstance(cond_type, VectorType):
@@ -366,10 +379,59 @@ class CustomSympyPrinter(CCodePrinter):
                    res += ".0f"
                else:
                    res += "f"
+            else:
+                if '.' not in res:
+                    res += "."
            return res
        else:
            return res

+    def _print_Sum(self, expr):
+        template = """[&]() {{
+    {dtype} sum = ({dtype}) 0;
+    for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
+        sum += {expr};
+    }}
+    return sum;
+}}()"""
+        var = expr.limits[0][0]
+        start = expr.limits[0][1]
+        end = expr.limits[0][2]
+        code = template.format(
+            dtype=get_type_of_expression(expr.args[0]),
+            iterator_dtype='int',
+            var=self._print(var),
+            start=self._print(start),
+            end=self._print(end),
+            expr=self._print(expr.function),
+            increment=str(1),
+            condition=self._print(var) + ' <= ' + self._print(end)  # if start < end else '>='
+        )
+        return code
+
+    def _print_Product(self, expr):
+        template = """[&]() {{
+    {dtype} product = ({dtype}) 1;
+    for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
+        product *= {expr};
+    }}
+    return product;
+}}()"""
+        var = expr.limits[0][0]
+        start = expr.limits[0][1]
+        end = expr.limits[0][2]
+        code = template.format(
+            dtype=get_type_of_expression(expr.args[0]),
+            iterator_dtype='int',
+            var=self._print(var),
+            start=self._print(start),
+            end=self._print(end),
+            expr=self._print(expr.function),
+            increment=str(1),
+            condition=self._print(var) + ' <= ' + self._print(end)  # if start < end else '>='
+        )
+        return code
+
    _print_Max = C89CodePrinter._print_Max
    _print_Min = C89CodePrinter._print_Min


--- a/pystencils/backends/cuda_backend.py
+++ b/pystencils/backends/cuda_backend.py
 from os.path import dirname, join

+import numpy as np
+import sympy
+from sympy.core.compatibility import string_types
+
 from pystencils.astnodes import Node
 from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
+from pystencils.data_types import cast_func, create_type, get_type_of_expression
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
+from pystencils.interpolation_astnodes import InterpolationMode

 with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
    lines = f.readlines()
@@ -43,11 +49,19 @@ class CudaBackend(CBackend):
        return code

    def _print_TextureDeclaration(self, node):
-        code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
-            str(node.texture.field.dtype),
-            node.texture.field.spatial_dimensions,
-            node.texture
-        )
+
+        if node.texture.field.dtype.numpy_dtype.itemsize > 4:
+            code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
+                str(node.texture.field.dtype),
+                node.texture.field.spatial_dimensions,
+                node.texture
+            )
+        else:
+            code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
+                str(node.texture.field.dtype),
+                node.texture.field.spatial_dimensions,
+                node.texture
+            )
        return code

    def _print_SkipIteration(self, _):
@@ -62,17 +76,23 @@ class CudaSympyPrinter(CustomSympyPrinter):
        self.known_functions.update(CUDA_KNOWN_FUNCTIONS)

    def _print_TextureAccess(self, node):
+        dtype = node.texture.field.dtype.numpy_dtype

-        if node.texture.cubic_bspline_interpolation:
+        if node.texture.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
            template = "cubicTex%iDSimple<%s>(%s, %s)"
        else:
-            template = "tex%iD<%s>(%s, %s)"
+            if dtype.itemsize > 4:
+                # Use PyCuda hack!
+                # https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
+                template = "fp_tex%iD(%s, %s)"
+            else:
+                template = "tex%iD(%s, %s)"

        code = template % (
            node.texture.field.spatial_dimensions,
-            str(node.texture.field.dtype),
            str(node.texture),
-            ', '.join(self._print(o) for o in node.offsets)
+            # + 0.5 comes from Nvidia's staggered indexing
+            ', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
        )
        return code

@@ -84,3 +104,78 @@ class CudaSympyPrinter(CustomSympyPrinter):
        elif isinstance(expr, fast_inv_sqrt):
            return "__frsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args)
        return super()._print_Function(expr)
+
+    def _print_Pow(self, expr):
+        if np.issubdtype(get_type_of_expression(expr.args[0],
+                                                default_float_type=self._float_type).numpy_dtype, np.integer):
+            expr._args[0] = cast_func(expr.args[0], create_type(self._float_type))
+
+        if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
+            return "(" + self._print(sympy.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")"
+        elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
+            return "1 / ({})".format(self._print(sympy.Mul(*[expr.base] * (-expr.exp), evaluate=False)))
+        else:
+
+            try:
+                suffix = self._get_func_suffix(get_type_of_expression(
+                    expr.args[0], default_float_type=self._float_type).sympy_dtype)
+            except Exception:
+                suffix = ""
+            if expr.exp == -1:
+                return '1.0%s/%s' % (suffix.upper(), self.parenthesize(expr.base, sympy.PREC))
+            elif expr.exp == 0.5:
+                return '%ssqrt%s(%s)' % (self._ns, suffix, self._print(expr.base))
+            elif expr.exp == sympy.S.One / 3 and self.standard != 'C89':
+                return '%scbrt%s(%s)' % (self._ns, suffix, self._print(expr.base))
+            else:
+                return '%spow%s(%s, %s)' % (self._ns, suffix, self._print(expr.base),
+                                            self._print(expr.exp))
+
+    def _print_math_func(self, expr, nest=False, known=None):
+        if np.issubdtype(get_type_of_expression(expr.args[0],
+                                                default_float_type=self._float_type).numpy_dtype, np.integer):
+            expr._args = (cast_func(expr.args[0], self._float_type), *expr.args[1:])
+
+        if len(expr.args) == 2 and \
+                (get_type_of_expression(expr.args[0], self._float_type) !=  # NOQA
+                 get_type_of_expression(expr.args[1], default_float_type=self._float_type)):
+            expr._args = (expr.args[0], cast_func(expr.args[1], get_type_of_expression(
+                expr.args[0], default_float_type=self._float_type)))
+
+        if known is None:
+            known = self.known_functions[expr.__class__.__name__]
+        if not isinstance(known, string_types):
+            for cb, name in known:
+                if cb(*expr.args):
+                    known = name
+                    break
+            else:
+                raise ValueError("No matching printer")
+        suffix = self._get_func_suffix(
+            get_type_of_expression(expr.args[0], default_float_type=self._float_type
+                                   ).sympy_dtype) if self._ns + known in self._prec_funcs else ''
+
+        if nest:
+            args = self._print(expr.args[0])
+            if len(expr.args) > 1:
+                paren_pile = ''
+                for curr_arg in expr.args[1:-1]:
+                    paren_pile += ')'
+                    args += ', {ns}{name}{suffix}({next}'.format(
+                        ns=self._ns,
+                        name=known,
+                        suffix=suffix,
+                        next=self._print(curr_arg)
+                    )
+                args += ', %s%s' % (
+                    self._print(expr.func(expr.args[-1])),
+                    paren_pile
+                )
+        else:
+            args = ', '.join(map(lambda arg: self._print(arg), expr.args))
+        return '{ns}{name}{suffix}({args})'.format(
+            ns=self._ns,
+            name=known,
+            suffix=suffix,
+            args=args
+        )
--- a/pystencils/backends/cuda_known_functions.txt
+++ b/pystencils/backends/cuda_known_functions.txt
@@ -45,6 +45,7 @@ tex1D
 tex2D
 tex3D

+sqrtf
 rsqrtf
 cbrtf
 rcbrtf

--- a/pystencils/cpu/kernelcreation.py
+++ b/pystencils/cpu/kernelcreation.py
@@ -10,8 +10,8 @@ from pystencils.data_types import BasicType, StructType, TypedSymbol, create_typ
 from pystencils.field import Field, FieldType
 from pystencils.transformations import (
    add_types, filtered_tree_iteration, get_base_buffer_index, get_optimal_loop_ordering,
-    make_loop_over_domain, move_constants_before_loop, parse_base_pointer_info,
-    resolve_buffer_accesses, resolve_field_accesses, split_inner_loop)
+    implement_interpolations, make_loop_over_domain, move_constants_before_loop,
+    parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, split_inner_loop)

 AssignmentOrAstNodeList = List[Union[Assignment, ast.Node]]

@@ -67,6 +67,7 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
                                                        ghost_layers=ghost_layers, loop_order=loop_order)
    ast_node = KernelFunction(loop_node, 'cpu', 'c', compile_function=make_python_function,
                              ghost_layers=ghost_layer_info, function_name=function_name)
+    implement_interpolations(body)

    if split_groups:
        typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
@@ -139,6 +140,8 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
    loop_body = Block([])
    loop_node = LoopOverCoordinate(loop_body, coordinate_to_loop_over=0, start=0, stop=index_fields[0].shape[0])

+    implement_interpolations(loop_node)
+
    for assignment in assignments:
        loop_body.append(assignment)


--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
 import ctypes
+import uuid
+from typing import Tuple

 import numpy as np
 import sympy as sp
+import sympy.codegen.ast
 from sympy.core.cache import cacheit
 from sympy.logic.boolalg import Boolean

+import pystencils
 from pystencils.cache import memorycache
 from pystencils.utils import all_equal

@@ -15,6 +19,30 @@ except ImportError as e:
    _ir_importerror = e


+def typed_symbols(names, dtype, *args):
+    symbols = sp.symbols(names, *args)
+    if isinstance(symbols, Tuple):
+        return tuple(TypedSymbol(str(s), dtype) for s in symbols)
+    else:
+        return TypedSymbol(str(symbols), dtype)
+
+
+def matrix_symbols(names, dtype, rows, cols):
+    if isinstance(names, str):
+        names = names.replace(' ', '').split(',')
+
+    matrices = []
+    for n in names:
+        symbols = typed_symbols("%s:%i" % (n, rows * cols), dtype)
+        matrices.append(sp.Matrix(rows, cols, lambda i, j: symbols[i * cols + j]))
+
+    return tuple(matrices)
+
+
+def get_dummy_symbol(dtype=None):
+    return pystencils.typed_symbols('dummy%s' % uuid.uuid4().hex, dtype or create_type('bool'))
+
+
 # noinspection PyPep8Naming
 class address_of(sp.Function):
    is_Atom = True
@@ -329,12 +357,17 @@ def peel_off_type(dtype, type_to_peel_off):
    return dtype


-def collate_types(types):
+def collate_types(types, forbid_collation_to_float=False):
    """
    Takes a sequence of types and returns their "common type" e.g. (float, double, float) -> double
    Uses the collation rules from numpy.
    """

+    if forbid_collation_to_float:
+        types = [t for t in types if not (hasattr(t, 'is_float') and t.is_float())]
+        if not types:
+            return [create_type('int32')]
+
    # Pointer arithmetic case i.e. pointer + integer is allowed
    if any(type(t) is PointerType for t in types):
        pointer_type = None
@@ -370,17 +403,19 @@ def collate_types(types):


 @memorycache(maxsize=2048)
-def get_type_of_expression(expr):
+def get_type_of_expression(expr, default_float_type="double", default_int_type='int'):
    from pystencils.astnodes import ResolvedFieldAccess
    from pystencils.cpu.vectorization import vec_all, vec_any

    expr = sp.sympify(expr)
    if isinstance(expr, sp.Integer):
-        return create_type("int")
+        return create_type(default_int_type)
    elif isinstance(expr, sp.Rational) or isinstance(expr, sp.Float):
-        return create_type("double")
+        return create_type(default_float_type)
    elif isinstance(expr, ResolvedFieldAccess):
        return expr.field.dtype
+    elif isinstance(expr, pystencils.field.Field.AbstractAccess):
+        return expr.field.dtype
    elif isinstance(expr, TypedSymbol):
        return expr.dtype
    elif isinstance(expr, sp.Symbol):
@@ -401,16 +436,23 @@ def get_type_of_expression(expr):
    elif isinstance(expr, sp.boolalg.Boolean) or isinstance(expr, sp.boolalg.BooleanFunction):
        # if any arg is of vector type return a vector boolean, else return a normal scalar boolean
        result = create_type("bool")
-        vec_args = [get_type_of_expression(a) for a in expr.args if isinstance(get_type_of_expression(a), VectorType)]
+        vec_args = [get_type_of_expression(a)
+                    for a in expr.args if isinstance(get_type_of_expression(a), VectorType)]
        if vec_args:
            result = VectorType(result, width=vec_args[0].width)
        return result
    elif isinstance(expr, sp.Pow):
        return get_type_of_expression(expr.args[0])
    elif isinstance(expr, sp.Expr):
-        types = tuple(get_type_of_expression(a) for a in expr.args)
-        return collate_types(types)
-
+        expr: sp.Expr
+        if expr.args:
+            types = tuple(get_type_of_expression(a) for a in expr.args)
+            return collate_types(types)
+        else:
+            if expr.is_integer:
+                return create_type(default_int_type)
+            else:
+                return create_type(default_float_type)
    raise NotImplementedError("Could not determine type for", expr, type(expr))


@@ -463,6 +505,10 @@ class BasicType(Type):
    def numpy_dtype(self):
        return self._dtype

+    @property
+    def sympy_dtype(self):
+        return getattr(sympy.codegen.ast, str(self.numpy_dtype))
+
    @property
    def item_size(self):
        return 1

--- a/pystencils/field.py
+++ b/pystencils/field.py
+import functools
 import hashlib
+import operator
 import pickle
 import re
 from enum import Enum
@@ -9,6 +11,7 @@ import numpy as np
 import sympy as sp
 from sympy.core.cache import cacheit

+import pystencils
 from pystencils.alignedarray import aligned_empty
 from pystencils.data_types import StructType, TypedSymbol, create_type
 from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol
@@ -38,7 +41,6 @@ def fields(description=None, index_dimensions=0, layout=None, **kwargs):
            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
            >>> assert v.index_shape == (2,)

-
        Format string can be left out, field names are taken from keyword arguments.
            >>> fields(f1=arr_s, f2=arr_s)
            [f1, f2]
@@ -292,6 +294,10 @@ class Field(AbstractField):
        self.shape = shape
        self.strides = strides
        self.latex_name = None  # type: Optional[str]
+        self.coordinate_origin = sp.Matrix(tuple(
+            0 for _ in range(self.spatial_dimensions)
+        ))  # type: tuple[float,sp.Symbol]
+        self.coordinate_transform = sp.eye(self.spatial_dimensions)

    def new_field_with_different_name(self, new_name):
        if self.has_fixed_shape:
@@ -312,6 +318,9 @@ class Field(AbstractField):
    def ndim(self) -> int:
        return len(self.shape)

+    def values_per_cell(self) -> int:
+        return functools.reduce(operator.mul, self.index_shape, 1)
+
    @property
    def layout(self):
        return self._layout
@@ -409,6 +418,34 @@ class Field(AbstractField):
            return False
        return self.hashable_contents() == other.hashable_contents()

+    @property
+    def physical_coordinates(self):
+        return self.coordinate_transform @ (self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
+
+    @property
+    def physical_coordinates_staggered(self):
+        return self.coordinate_transform @ \
+            (self.coordinate_origin + pystencils.x_staggered_vector(self.spatial_dimensions))
+
+    def index_to_physical(self, index_coordinates, staggered=False):
+        if staggered:
+            index_coordinates = sp.Matrix([i + 0.5 for i in index_coordinates])
+        return self.coordinate_transform @ (self.coordinate_origin + index_coordinates)
+
+    def physical_to_index(self, physical_coordinates, staggered=False):
+        rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin
+        if staggered:
+            rtn = sp.Matrix([i - 0.5 for i in rtn])
+
+        return rtn
+
+    def index_to_staggered_physical_coordinates(self, symbol_vector):
+        symbol_vector += sp.Matrix([0.5] * self.spatial_dimensions)
+        return self.create_physical_coordinates(symbol_vector)
+
+    def set_coordinate_origin_to_field_center(self):
+        self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape])
+
    # noinspection PyAttributeOutsideInit,PyUnresolvedReferences
    class Access(TypedSymbol, AbstractField.AbstractAccess):
        """Class representing a relative access into a `Field`.
@@ -429,11 +466,12 @@ class Field(AbstractField):
            >>> central_y_component.at_index(0)  # change component
            v_C^0
        """
+
        def __new__(cls, name, *args, **kwargs):
            obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs)
            return obj

-        def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False):
+        def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False, dtype=None):
            field_name = field.name
            offsets_and_index = (*offsets, *idx) if idx is not None else offsets
            constant_offsets = not any([isinstance(o, sp.Basic) and not o.is_Integer for o in offsets_and_index])
@@ -484,7 +522,7 @@ class Field(AbstractField):
            return obj

        def __getnewargs__(self):
-            return self.field, self.offsets, self.index, self.is_absolute_access
+            return self.field, self.offsets, self.index, self.is_absolute_access, self.dtype

        # noinspection SpellCheckingInspection
        __xnew__ = staticmethod(__new_stage2__)
@@ -503,7 +541,7 @@ class Field(AbstractField):
            if len(idx) != self.field.index_dimensions:
                raise ValueError("Wrong number of indices: "
                                 "Got %d, expected %d" % (len(idx), self.field.index_dimensions))
-            return Field.Access(self.field, self._offsets, idx)
+            return Field.Access(self.field, self._offsets, idx, dtype=self.dtype)

        def __getitem__(self, *idx):
            return self.__call__(*idx)
@@ -562,7 +600,7 @@ class Field(AbstractField):
            """
            offset_list = list(self.offsets)
            offset_list[coord_id] += offset
-            return Field.Access(self.field, tuple(offset_list), self.index)
+            return Field.Access(self.field, tuple(offset_list), self.index, dtype=self.dtype)

        def get_shifted(self, *shift) -> 'Field.Access':
            """Returns a new Access with changed spatial coordinates
@@ -572,7 +610,10 @@ class Field(AbstractField):
                >>> f[0,0].get_shifted(1, 1)
                f_NE
            """
-            return Field.Access(self.field, tuple(a + b for a, b in zip(shift, self.offsets)), self.index)
+            return Field.Access(self.field,
+                                tuple(a + b for a, b in zip(shift, self.offsets)),
+                                self.index,
+                                dtype=self.dtype)

        def at_index(self, *idx_tuple) -> 'Field.Access':
            """Returns new Access with changed index.
@@ -582,7 +623,7 @@ class Field(AbstractField):
                >>> f(0).at_index(8)
                f_C^8
            """
-            return Field.Access(self.field, self.offsets, idx_tuple)
+            return Field.Access(self.field, self.offsets, idx_tuple, dtype=self.dtype)

        @property
        def is_absolute_access(self) -> bool:

--- a/CubicInterpolationCUDA @ f36dde8b
+++ b/CubicInterpolationCUDA @ f36dde8b
+Subproject commit f36dde8bede20f0a4bae60cd2675e7f6ede65887
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
+from os.path import dirname, isdir, join
+
 import numpy as np

 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.data_types import StructType
 from pystencils.field import FieldType
+from pystencils.gpucuda.texture_utils import ndarray_to_tex
 from pystencils.include import get_pystencils_include_path
+from pystencils.interpolation_astnodes import InterpolationMode, TextureAccess
 from pystencils.kernelparameters import FieldPointerSymbol

 USE_FAST_MATH = True
@@ -29,17 +33,30 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
    if argument_dict is None:
        argument_dict = {}

-    header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node))
+    header_list = ['<cstdint>', '<pycuda-helper-modified.hpp>'] + list(get_headers(kernel_function_node))
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
    code += str(generate_c(kernel_function_node, dialect='cuda', custom_backend=custom_backend))
-    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    textures = set(d.texture for d in kernel_function_node.atoms(TextureAccess))
+
+    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
    if USE_FAST_MATH:
-        options.append("-use_fast_math")
-    mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()])
+        nvcc_options.append("-use_fast_math")
+
+    if any(t.interpolation_mode == InterpolationMode.CUBIC_SPLINE for t in textures):
+        assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \
+            "Submodule CubicInterpolationCUDA does not exist"
+        nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
+        nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
+
+        needed_dims = set(t.field.spatial_dimensions for t in textures if t.cubic_bspline_interpolation)
+        for i in needed_dims:
+            code = 'extern "C++" {\n#include "cubicTex%iD.cu"\n}\n' % i + code
+
+    mod = SourceModule(code, options=nvcc_options, include_dirs=[get_pystencils_include_path()])
    func = mod.get_function(kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()
@@ -63,6 +80,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

+            # TODO: use texture objects:
+            # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
+            for tex in textures:
+                tex_ref = mod.get_texref(str(tex))
+                ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
+                               tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
            args = _build_numpy_argument_list(parameters, full_arguments)
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique

--- a/pystencils/gpucuda/kernelcreation.py
+++ b/pystencils/gpucuda/kernelcreation.py
@@ -4,12 +4,18 @@ from pystencils.field import Field, FieldType
 from pystencils.gpucuda.cudajit import make_python_function
 from pystencils.gpucuda.indexing import BlockIndexing
 from pystencils.transformations import (
-    add_types, get_base_buffer_index, get_common_shape, parse_base_pointer_info,
-    resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
-
-
-def create_cuda_kernel(assignments, function_name="kernel", type_info=None, indexing_creator=BlockIndexing,
-                       iteration_slice=None, ghost_layers=None, skip_independence_check=False):
+    add_types, get_base_buffer_index, get_common_shape, implement_interpolations,
+    parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
+
+
+def create_cuda_kernel(assignments,
+                       function_name="kernel",
+                       type_info=None,
+                       indexing_creator=BlockIndexing,
+                       iteration_slice=None,
+                       ghost_layers=None,
+                       skip_independence_check=False,
+                       use_textures_for_interpolation=True):
    fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check)
    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])
@@ -57,6 +63,8 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde
    ast = KernelFunction(block, 'gpu', 'gpucuda', make_python_function, ghost_layers, function_name)
    ast.global_variables.update(indexing.index_variables)

+    implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
+
    base_pointer_spec = [['spatialInner0']]
    base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
                                                         f.spatial_dimensions, f.index_dimensions)
@@ -86,8 +94,13 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde
    return ast


-def created_indexed_cuda_kernel(assignments, index_fields, function_name="kernel", type_info=None,
-                                coordinate_names=('x', 'y', 'z'), indexing_creator=BlockIndexing):
+def created_indexed_cuda_kernel(assignments,
+                                index_fields,
+                                function_name="kernel",
+                                type_info=None,
+                                coordinate_names=('x', 'y', 'z'),
+                                indexing_creator=BlockIndexing,
+                                use_textures_for_interpolation=True):
    fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False)
    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])
@@ -125,6 +138,8 @@ def created_indexed_cuda_kernel(assignments, index_fields, function_name="kernel
    ast = KernelFunction(function_body, 'gpu', 'gpucuda', make_python_function, None, function_name)
    ast.global_variables.update(indexing.index_variables)

+    implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
+
    coord_mapping = indexing.coordinates
    base_pointer_spec = [['spatialInner0']]
    base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],

--- a/pystencils/gpucuda/texture_utils.py
+++ b/pystencils/gpucuda/texture_utils.py
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+
+"""
+
+"""
+
+from os.path import dirname, isdir, join
+
+import numpy as np
+
+try:
+    import pycuda.driver as cuda
+    from pycuda import gpuarray
+except Exception:
+    pass
+
+
+def pow_two_divider(n):
+    if n == 0:
+        return 0
+    divider = 1
+    while (n & divider) == 0:
+        divider <<= 1
+    return divider
+
+
+def ndarray_to_tex(tex_ref,
+                   ndarray,
+                   address_mode=None,
+                   filter_mode=None,
+                   use_normalized_coordinates=False,
+                   read_as_integer=False):
+
+    if address_mode is None:
+        address_mode = cuda.address_mode.BORDER
+    if filter_mode is None:
+        filter_mode = cuda.filter_mode.LINEAR
+
+    if isinstance(ndarray, np.ndarray):
+        cu_array = cuda.np_to_array(ndarray, 'C')
+    elif isinstance(ndarray, gpuarray.GPUArray):
+        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
+    else:
+        raise TypeError(
+            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')
+
+    cuda.TextureReference.set_array(tex_ref, cu_array)
+
+    tex_ref.set_address_mode(0, address_mode)
+    if ndarray.ndim >= 2:
+        tex_ref.set_address_mode(1, address_mode)
+    if ndarray.ndim >= 3:
+        tex_ref.set_address_mode(2, address_mode)
+    tex_ref.set_filter_mode(filter_mode)
+
+    if not use_normalized_coordinates:
+        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES)
+
+    if not read_as_integer:
+        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER)
+
+
+def prefilter_for_cubic_bspline(gpuarray):
+    import pycuda.autoinit  # NOQA
+    from pycuda.compiler import SourceModule
+
+    ndim = gpuarray.ndim
+    assert ndim == 2 or ndim == 3, "Only 2d or 3d supported"
+    assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \
+        "Submodule CubicInterpolationCUDA does not exist"
+    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
+    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
+
+    file_name = join(dirname(__file__), "CubicInterpolationCUDA", "code", "cubicPrefilter%iD.cu" % ndim)
+    with open(file_name) as file:
+        code = file.read()
+
+    mod = SourceModule(code, options=nvcc_options)
+
+    if ndim == 2:
+        height, width = gpuarray.shape
+        block = min(pow_two_divider(height), 64)
+        grid = height // block
+        func = mod.get_function('SamplesToCoefficients2DXf')
+        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
+                                                          for r in reversed(gpuarray.shape)),
+             block=(block, 1, 1),
+             grid=(grid, 1, 1))
+
+        block = min(pow_two_divider(width), 64)
+        grid = width // block
+        func = mod.get_function('SamplesToCoefficients2DYf')
+        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
+                                                          for r in reversed(gpuarray.shape)),
+             block=(block, 1, 1),
+             grid=(grid, 1, 1))
+    elif ndim == 3:
+        depth, height, width = gpuarray.shape
+        dimX = min(min(pow_two_divider(width), pow_two_divider(height)), 64)
+        dimY = min(min(pow_two_divider(depth), pow_two_divider(height)), 512 / dimX)
+        block = (dimX, dimY, 1)
+
+        dimGridX = (height // block[0], depth // block[1], 1)
+        dimGridY = (width // block[0], depth // block[1], 1)
+        dimGridZ = (width // block[0], height // block[1], 1)
+
+        func = mod.get_function("SamplesToCoefficients3DXf")
+        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
+                                                          for r in reversed(gpuarray.shape)),
+             block=block,
+             grid=dimGridX)
+        func = mod.get_function("SamplesToCoefficients3DYf")
+        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
+                                                          for r in reversed(gpuarray.shape)),
+             block=block,
+             grid=dimGridY)
+        func = mod.get_function("SamplesToCoefficients3DZf")
+        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
+                                                          for r in reversed(gpuarray.shape)),
+             block=block,
+             grid=dimGridZ)
--- a/pystencils/include/pycuda-helper-modified.hpp
+++ b/pystencils/include/pycuda-helper-modified.hpp
+#include <pycuda-complex.hpp>
+#include <surface_functions.h>
+#ifndef _AFJKDASLFSADHF_HEADER_SEEN_PYCUDA_HELPERS_HPP
+#define _AFJKDASLFSADHF_HEADER_SEEN_PYCUDA_HELPERS_HPP
+
+extern "C++" {
+  // "double-precision" textures ------------------------------------------------
+  /* Thanks to Nathan Bell <nbell@nvidia.com> for help in figuring this out. */
+
+  typedef float fp_tex_float;
+  typedef int2 fp_tex_double;
+  typedef uint2 fp_tex_cfloat;
+  typedef int4 fp_tex_cdouble;
+
+   template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<float> fp_tex1Dfetch(texture<fp_tex_cfloat, 1, read_mode> tex,  float i)
+  {
+    fp_tex_cfloat v = tex1Dfetch(tex, i);
+    pycuda::complex<float> out;
+    return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y));
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<double> fp_tex1Dfetch(texture<fp_tex_cdouble, 1, read_mode> tex,  float i)
+  {
+    fp_tex_cdouble v = tex1Dfetch(tex, i);
+    return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ double fp_tex1Dfetch(texture<fp_tex_double, 1, read_mode> tex,  float i)
+  {
+    fp_tex_double v = tex1Dfetch(tex, i);
+    return __hiloint2double(v.y, v.x);
+  }
+
+// 2D functionality
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ double fp_tex2D(texture<fp_tex_double, 2, read_mode> tex, float i,  float j)
+  {
+    fp_tex_double v = tex2D(tex, i, j);
+    return __hiloint2double(v.y, v.x);
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<float> fp_tex2D(texture<fp_tex_cfloat, 2, read_mode> tex,  float i,  float j)
+  {
+    fp_tex_cfloat v = tex2D(tex, i, j);
+    return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y));
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<double> fp_tex2D(texture<fp_tex_cdouble, 2, read_mode> tex,  float i,  float j)
+  {
+    fp_tex_cdouble v = tex2D(tex, i, j);
+    return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
+  }
+  // 2D Layered extension
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ double fp_tex2DLayered(texture<fp_tex_double, cudaTextureType2DLayered, read_mode> tex, float i,  float j, int layer)
+  {
+    fp_tex_double v = tex2DLayered(tex, i, j, layer);
+    return __hiloint2double(v.y, v.x);
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<float> fp_tex2DLayered(texture<fp_tex_cfloat, cudaTextureType2DLayered, read_mode> tex, float i,  float j, int layer)
+  {
+    fp_tex_cfloat v = tex2DLayered(tex, i, j, layer);
+    return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y));
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<double> fp_tex2DLayered(texture<fp_tex_cdouble, cudaTextureType2DLayered, read_mode> tex, float i,  float j, int layer)
+  {
+    fp_tex_cdouble v = tex2DLayered(tex, i, j, layer);
+    return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
+  }
+
+  // 3D functionality
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ double fp_tex3D(texture<fp_tex_double, 3, read_mode> tex,  float i,  float j,  float k)
+  {
+    fp_tex_double v = tex3D(tex, i, j, k);
+    return __hiloint2double(v.y, v.x);
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<float> fp_tex3D(texture<fp_tex_cfloat, 3, read_mode> tex,  float i,  float j,  float k)
+  {
+    fp_tex_cfloat v = tex3D(tex, i, j, k);
+    return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y));
+  }
+
+  template <enum cudaTextureReadMode read_mode>
+  __device__ pycuda::complex<double> fp_tex3D(texture<fp_tex_cdouble, 3, read_mode> tex,  float i,  float j,  float k)
+  {
+    fp_tex_cdouble v = tex3D(tex, i, j, k);
+    return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
+  }
+
+  // FP_Surfaces with complex supprt
+
+  __device__ void fp_surf2DLayeredwrite(double var,surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_double auxvar;
+    auxvar.x =  __double2loint(var);
+    auxvar.y =  __double2hiint(var);
+    surf2DLayeredwrite(auxvar, surf, i*sizeof(fp_tex_double), j, layer, mode);
+  }
+
+  __device__ void fp_surf2DLayeredwrite(pycuda::complex<float> var,surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cfloat auxvar;
+    auxvar.x =  __float_as_int(var._M_re);
+    auxvar.y =  __float_as_int(var._M_im);
+    surf2DLayeredwrite(auxvar, surf, i*sizeof(fp_tex_cfloat), j, layer,mode);
+  }
+
+  __device__ void fp_surf2DLayeredwrite(pycuda::complex<double> var,surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cdouble auxvar;
+    auxvar.x =  __double2loint(var._M_re);
+    auxvar.y =  __double2hiint(var._M_re);
+
+    auxvar.z = __double2loint(var._M_im);
+    auxvar.w = __double2hiint(var._M_im);
+    surf2DLayeredwrite(auxvar, surf, i*sizeof(fp_tex_cdouble), j, layer,mode);
+  }
+
+  __device__ void fp_surf3Dwrite(double var,surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_double auxvar;
+    auxvar.x =  __double2loint(var);
+    auxvar.y =  __double2hiint(var);
+    surf3Dwrite(auxvar, surf, i*sizeof(fp_tex_double), j, k,mode);
+  }
+
+  __device__ void fp_surf3Dwrite(pycuda::complex<float> var,surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cfloat auxvar;
+    auxvar.x =  __float_as_int(var._M_re);
+    auxvar.y =  __float_as_int(var._M_im);
+
+    surf3Dwrite(auxvar, surf, i*sizeof(fp_tex_cfloat), j, k, mode);
+  }
+
+  __device__ void fp_surf3Dwrite(pycuda::complex<double> var,surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cdouble auxvar;
+    auxvar.x =  __double2loint(var._M_re);
+    auxvar.y =  __double2hiint(var._M_re);
+
+    auxvar.z = __double2loint(var._M_im);
+    auxvar.w = __double2hiint(var._M_im);
+    surf3Dwrite(auxvar, surf, i*sizeof(fp_tex_cdouble), j, k, mode);
+  }
+
+  __device__ void fp_surf2DLayeredread(double *var, surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_double v;
+    surf2DLayeredread(&v, surf, i*sizeof(fp_tex_double), j, layer, mode);
+    *var = __hiloint2double(v.y, v.x);
+  }
+
+  __device__ void fp_surf2DLayeredread(pycuda::complex<float> *var, surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cfloat v;
+    surf2DLayeredread(&v, surf, i*sizeof(fp_tex_cfloat), j, layer, mode);
+    *var = pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y));
+  }
+
+  __device__ void fp_surf2DLayeredread(pycuda::complex<double> *var, surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cdouble v;
+    surf2DLayeredread(&v, surf, i*sizeof(fp_tex_cdouble), j, layer, mode);
+    *var = pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
+  }
+
+  __device__ void fp_surf3Dread(double *var, surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_double v;
+    surf3Dread(&v, surf, i*sizeof(fp_tex_double), j, k, mode);
+    *var = __hiloint2double(v.y, v.x);
+  }
+
+  __device__ void fp_surf3Dread(pycuda::complex<float> *var, surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cfloat v;
+    surf3Dread(&v, surf, i*sizeof(fp_tex_cfloat), j, k, mode);
+    *var = pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y));
+  }
+
+  __device__ void fp_surf3Dread(pycuda::complex<double> *var, surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode)
+  {
+    fp_tex_cdouble v;
+    surf3Dread(&v, surf, i*sizeof(fp_tex_cdouble), j, k, mode);
+    *var = pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
+  }
+#define PYCUDA_GENERATE_FP_TEX_FUNCS(TYPE) \
+  template <enum cudaTextureReadMode read_mode> \
+  __device__ TYPE fp_tex1Dfetch(texture<TYPE, 1, read_mode> tex,  float i) \
+  { \
+    return tex1Dfetch(tex, i); \
+  } \
+  template <enum cudaTextureReadMode read_mode> \
+  __device__ TYPE fp_tex2D(texture<TYPE, 2, read_mode> tex,  float i,  float j) \
+  { \
+    return tex2D(tex, i, j); \
+  } \
+  template <enum cudaTextureReadMode read_mode> \
+   __device__ TYPE fp_tex2DLayered(texture<TYPE, cudaTextureType2DLayered, read_mode> tex,  float i,  float j, int layer) \
+  { \
+    return tex2DLayered(tex, i, j, layer); \
+  } \
+  template <enum cudaTextureReadMode read_mode> \
+  __device__ TYPE fp_tex3D(texture<TYPE, 3, read_mode> tex,  float i,  float j,  float k) \
+  { \
+    return tex3D(tex, i, j, k); \
+  } \
+  __device__ void fp_surf2DLayeredwrite(TYPE var,surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer,enum cudaSurfaceBoundaryMode mode) \
+  { \
+    surf2DLayeredwrite(var, surf, i*sizeof(TYPE), j, layer, mode); \
+  } \
+  __device__ void fp_surf2DLayeredread(TYPE *var, surface<void, cudaSurfaceType2DLayered> surf,  float i,  float j, int layer,enum cudaSurfaceBoundaryMode mode) \
+  { \
+    surf2DLayeredread(var, surf, i*sizeof(TYPE), j, layer, mode); \
+  } \
+  __device__ void fp_surf3Dwrite(TYPE var,surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode) \
+  { \
+    surf3Dwrite(var, surf, i*sizeof(TYPE), j, k, mode); \
+  } \
+  __device__ void fp_surf3Dread(TYPE *var, surface<void, 3> surf,  float i,  float j,  float k, enum cudaSurfaceBoundaryMode mode) \
+  { \
+    surf3Dread(var, surf, i*sizeof(TYPE), j, k, mode); \
+  }
+  PYCUDA_GENERATE_FP_TEX_FUNCS(float)
+  PYCUDA_GENERATE_FP_TEX_FUNCS(int)
+  PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned int)
+  PYCUDA_GENERATE_FP_TEX_FUNCS(short int)
+  PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned short int)
+  PYCUDA_GENERATE_FP_TEX_FUNCS(char)
+  PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned char)
+}
+
+#endif
--- a/pystencils/integer_functions.py
+++ b/pystencils/integer_functions.py
@@ -3,13 +3,13 @@ import sympy as sp
 from pystencils.data_types import collate_types, get_type_of_expression
 from pystencils.sympyextensions import is_integer_sequence

-bitwise_xor = sp.Function("bitwise_xor")
-bit_shift_right = sp.Function("bit_shift_right")
-bit_shift_left = sp.Function("bit_shift_left")
-bitwise_and = sp.Function("bitwise_and")
-bitwise_or = sp.Function("bitwise_or")
-int_div = sp.Function("int_div")
-int_power_of_2 = sp.Function("int_power_of_2")
+bitwise_xor = sp.Function("bitwise_xor", integer=True)
+bit_shift_right = sp.Function("bit_shift_right", integer=True)
+bit_shift_left = sp.Function("bit_shift_left", integer=True)
+bitwise_and = sp.Function("bitwise_and", integer=True)
+bitwise_or = sp.Function("bitwise_or", integer=True)
+int_div = sp.Function("int_div", integer=True)
+int_power_of_2 = sp.Function("int_power_of_2", integer=True)


 # noinspection PyPep8Naming
@@ -29,6 +29,7 @@ class modulo_floor(sp.Function):
        '(int64_t)((a) / (b)) * (b)'
    """
    nargs = 2
+    integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):
@@ -60,6 +61,7 @@ class modulo_ceil(sp.Function):
        '((a) % (b) == 0 ? a : ((int64_t)((a) / (b))+1) * (b))'
    """
    nargs = 2
+    integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):
@@ -89,6 +91,7 @@ class div_ceil(sp.Function):
        '( (a) % (b) == 0 ? (int64_t)(a) / (int64_t)(b) : ( (int64_t)(a) / (int64_t)(b) ) +1 )'
    """
    nargs = 2
+    integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):
@@ -118,6 +121,7 @@ class div_floor(sp.Function):
        '((int64_t)(a) / (int64_t)(b))'
    """
    nargs = 2
+    integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):

--- a/pystencils/interpolation_astnodes.py
+++ b/pystencils/interpolation_astnodes.py
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+
+"""
+
+"""
+
+import itertools
+from enum import Enum
+from typing import Set
+
+import sympy as sp
+from sympy.core.cache import cacheit
+
+import pystencils
+from pystencils.astnodes import Node
+from pystencils.data_types import TypedSymbol, cast_func, create_type
+
+try:
+    import pycuda.driver
+except Exception:
+    pass
+
+
+class InterpolationMode(str, Enum):
+    NEAREST_NEIGHBOR = "nearest_neighbour"
+    NN = NEAREST_NEIGHBOR
+    LINEAR = "linear"
+    CUBIC_SPLINE = "cubic_spline"
+
+
+class Interpolator(object):
+    """
+    Implements non-integer accesses on fields using linear interpolation.
+
+    On GPU, this interpolator can be implemented by a `TextureCachedField` for hardware acceleration.
+
+    Address modes are different boundary handlings possible choices are like for CUDA textures
+
+        **CLAMP**
+
+        The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M.
+
+        **BORDER**
+
+        The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0and for k >= M.
+
+        Now, to describe the last two address modes, we are forced to consider normalized coordinates,
+        so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1.
+
+        **WRAP**
+
+        The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M.
+        In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p.
+
+        **MIRROR**
+
+        The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2.
+        In other words, c[l / M] = c[k / M] for any l and k such that (l + k)mod(2 * M - 2) = 0.
+
+    Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures
+    """
+
+    required_global_declarations = []
+
+    def __init__(self,
+                 parent_field: pystencils.Field,
+                 interpolation_mode: InterpolationMode,
+                 address_mode='BORDER',
+                 use_normalized_coordinates=False):
+        super().__init__()
+
+        self.field = parent_field.new_field_with_different_name(parent_field.name)
+        self.field.field_type = pystencils.field.FieldType.CUSTOM
+        self.address_mode = address_mode
+        self.use_normalized_coordinates = use_normalized_coordinates
+        hash_str = "%x" % abs(hash(self.field) + hash(address_mode))
+        self.symbol = TypedSymbol('dummy_symbol_carrying_field' + self.field.name + hash_str,
+                                  'dummy_symbol_carrying_field' + self.field.name + hash_str)
+        self.symbol.field = self.field
+        self.symbol.interpolator = self
+        self.interpolation_mode = interpolation_mode
+
+    def at(self, offset):
+        return InterpolatorAccess(self.symbol, *offset)
+
+    def __getitem__(self, offset):
+        return InterpolatorAccess(self.symbol, *offset)
+
+    def __str__(self):
+        return '%s_interpolator' % self.field.name
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __hash__(self):
+        return hash((str(self.address_mode),
+                     str(type(self)),
+                     self.symbol,
+                     self.address_mode,
+                     self.use_normalized_coordinates))
+
+
+class LinearInterpolator(Interpolator):
+
+    def __init__(self,
+                 parent_field: pystencils.Field,
+                 address_mode='BORDER',
+                 use_normalized_coordinates=False):
+        super().__init__(parent_field,
+                         InterpolationMode.LINEAR,
+                         address_mode,
+                         use_normalized_coordinates)
+
+
+class NearestNeightborInterpolator(Interpolator):
+
+    def __init__(self,
+                 parent_field: pystencils.Field,
+                 address_mode='BORDER',
+                 use_normalized_coordinates=False):
+        super().__init__(parent_field,
+                         InterpolationMode.NN,
+                         address_mode,
+                         use_normalized_coordinates)
+
+
+class InterpolatorAccess(TypedSymbol):
+    def __new__(cls, field, offsets, *args, **kwargs):
+        obj = TextureAccess.__xnew_cached_(cls, field, offsets, *args, **kwargs)
+        return obj
+
+    def __new_stage2__(self, symbol, *offsets):
+        assert offsets is not None
+        obj = super().__xnew__(self, '%s_interpolator_%x' %
+                               (symbol.field.name, abs(hash(tuple(offsets)))), symbol.field.dtype)
+        obj.offsets = offsets
+        obj.symbol = symbol
+        obj.field = symbol.field
+        obj.interpolator = symbol.interpolator
+        return obj
+
+    def __hash__(self):
+        return hash((self.symbol, self.field, tuple(self.offsets), self.interpolator))
+
+    def __str__(self):
+        return '%s_interpolator(%s)' % (self.field.name, ','.join(str(o) for o in self.offsets))
+
+    def __repr__(self):
+        return self.__str__()
+
+    def atoms(self, *types):
+        if self.offsets:
+            offsets = set(o for o in self.offsets if isinstance(o, types))
+            if isinstance(self, *types):
+                offsets.update([self])
+            for o in self.offsets:
+                if hasattr(o, 'atoms'):
+                    offsets.update(set(o.atoms(types)))
+            return offsets
+        else:
+            return set()
+
+    @property
+    def free_symbols(self):
+        symbols = set()
+        if self.offsets is not None:
+            for o in self.offsets:
+                if hasattr(o, 'free_symbols'):
+                    symbols.update(set(o.free_symbols))
+                # if hasattr(o, 'atoms'):
+                    # symbols.update(set(o.atoms(sp.Symbol)))
+
+        return symbols
+
+    @property
+    def args(self):
+        return [self.symbol, *self.offsets]
+
+    @property
+    def symbols_defined(self) -> Set[sp.Symbol]:
+        return {self}
+
+    @property
+    def interpolation_mode(self):
+        return self.interpolator.interpolation_mode
+
+    def implementation_with_stencils(self):
+        field = self.field
+
+        default_int_type = create_type('uint32')
+        use_textures = isinstance(self, TextureAccess)
+        if use_textures:
+            def absolute_access(x, _):
+                return self.texture.at((o for o in x))
+        else:
+            absolute_access = field.absolute_access
+
+        sum = [0, ] * (field.shape[0] if field.index_dimensions else 1)
+
+        offsets = self.offsets
+        rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1)
+
+        for channel_idx in range(field.shape[0] if field.index_dimensions else 1):
+            if self.interpolation_mode == InterpolationMode.NN:
+                if use_textures:
+                    sum[channel_idx] = self
+                else:
+                    sum[channel_idx] = absolute_access([sp.floor(i + 0.5) for i in offsets], channel_idx)
+
+            elif self.interpolation_mode == InterpolationMode.LINEAR:
+                # TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/
+                for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions):
+                    weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
+                    index = [f(offset) for (f, offset) in zip(c, offsets)]
+                    # Hardware boundary handling on GPU
+                    if use_textures:
+                        weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
+                        sum[channel_idx] += \
+                            weight * absolute_access(index, channel_idx if field.index_dimensions else ())
+                    # else boundary handling using software
+                    elif str(self.interpolator.address_mode).lower() == 'border':
+                        is_inside_field = sp.And(
+                            *itertools.chain([i >= 0 for i in index],
+                                             [idx < field.shape[dim] for (dim, idx) in enumerate(index)]))
+                        index = [cast_func(i, default_int_type) for i in index]
+                        sum[channel_idx] += sp.Piecewise(
+                            (weight * absolute_access(index, channel_idx if field.index_dimensions else ()),
+                                is_inside_field),
+                            (sp.simplify(0), True)
+                        )
+                    elif str(self.interpolator.address_mode).lower() == 'clamp':
+                        index = [cast_func(sp.Min(sp.Max(0, i), field.shape[dim] - 1), default_int_type)
+                                 for (dim, i) in enumerate(index)]
+                        sum[channel_idx] += weight * \
+                            absolute_access(index, channel_idx if field.index_dimensions else ())
+                    elif str(self.interpolator.address_mode).lower() == 'wrap':
+                        index = [cast_func(sp.Piecewise((sp.Mod(i, field.shape[dim]), i >= 0),
+                                                        (field.shape[dim] + sp.Mod(i, field.shape[dim]), True)),
+                                           default_int_type)
+                                 for (dim, i) in enumerate(index)]
+                        sum[channel_idx] += weight * \
+                            absolute_access(index, channel_idx if field.index_dimensions else ())
+                    elif str(self.interpolator.address_mode).lower() == 'mirror':
+                        def triangle_fun(x, half_period):
+                            saw_tooth = sp.Abs(x) % (2 * half_period)
+                            return sp.Piecewise((saw_tooth, saw_tooth < half_period),
+                                                (2 * half_period - 1 - saw_tooth, True))
+                        index = [cast_func(triangle_fun(i, field.shape[dim]),
+                                           default_int_type) for (dim, i) in enumerate(index)]
+                        sum[channel_idx] += weight * \
+                            absolute_access(index, channel_idx if field.index_dimensions else ())
+                    else:
+                        raise NotImplementedError()
+            elif self.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
+                raise NotImplementedError("only works with HW interpolation for float32")
+
+            sum = [sp.factor(s) for s in sum]
+
+            if field.index_dimensions:
+                return sp.Matrix(sum)
+            else:
+                return sum[0]
+
+    # noinspection SpellCheckingInspection
+    __xnew__ = staticmethod(__new_stage2__)
+    # noinspection SpellCheckingInspection
+    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
+
+##########################################################################################
+# GPU-specific fast specializations (for precision GPUs can also use above nodes/symbols #
+##########################################################################################
+
+
+class TextureCachedField:
+
+    def __init__(self, parent_field,
+                 address_mode=None,
+                 filter_mode=None,
+                 interpolation_mode: InterpolationMode = InterpolationMode.LINEAR,
+                 use_normalized_coordinates=False,
+                 read_as_integer=False
+                 ):
+        if isinstance(address_mode, str):
+            address_mode = getattr(pycuda.driver.address_mode, address_mode.upper())
+
+        if address_mode is None:
+            address_mode = pycuda.driver.address_mode.BORDER
+        if filter_mode is None:
+            filter_mode = pycuda.driver.filter_mode.LINEAR
+
+        # self, field_name, field_type, dtype, layout, shape, strides
+        self.field = parent_field
+        self.required_global_declarations = [TextureDeclaration(self)]
+        self.address_mode = address_mode
+        self.filter_mode = filter_mode
+        self.read_as_integer = read_as_integer
+        self.use_normalized_coordinates = use_normalized_coordinates
+        self.symbol = TypedSymbol(str(self), self.field.dtype.numpy_dtype)
+        self.symbol.interpolator = self
+        self.symbol.field = self.field
+        self.interpolation_mode = interpolation_mode
+
+        # assert str(self.field.dtype) != 'double', "CUDA does not support double textures!"
+        # assert dtype_supports_textures(self.field.dtype), "CUDA only supports texture types with 32 bits or less"
+
+    @classmethod
+    def from_interpolator(cls, interpolator: LinearInterpolator):
+        obj = cls(interpolator.field, interpolator.address_mode, interpolation_mode=interpolator.interpolation_mode)
+        return obj
+
+    def at(self, offset):
+        return TextureAccess(self.symbol, *offset)
+
+    def __getitem__(self, offset):
+        return TextureAccess(self.symbol, *offset)
+
+    def __str__(self):
+        return '%s_texture_%x' % (self.field.name, abs(hash(self.field) + hash(str(self.address_mode))))
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class TextureAccess(InterpolatorAccess):
+    def __new__(cls, texture_symbol, offsets, *args, **kwargs):
+        obj = TextureAccess.__xnew_cached_(cls, texture_symbol, offsets, *args, **kwargs)
+        return obj
+
+    def __new_stage2__(self, symbol, *offsets):
+        obj = super().__xnew__(self, symbol, *offsets)
+        obj.required_global_declarations = symbol.interpolator.required_global_declarations
+        obj.required_global_declarations[0]._symbols_defined.add(obj)
+        return obj
+
+    def __str__(self):
+        return '%s_texture(%s)' % (self.interpolator.field.name, ','.join(str(o) for o in self.offsets))
+
+    @property
+    def texture(self):
+        return self.interpolator
+
+    # noinspection SpellCheckingInspection
+    __xnew__ = staticmethod(__new_stage2__)
+    # noinspection SpellCheckingInspection
+    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
+
+
+class TextureDeclaration(Node):
+
+    def __init__(self, parent_texture):
+        self.texture = parent_texture
+        self._symbols_defined = set()
+
+    @property
+    def symbols_defined(self) -> Set[sp.Symbol]:
+        return self._symbols_defined
+
+    @property
+    def args(self) -> Set[sp.Symbol]:
+        return set()
+
+
+def dtype_supports_textures(dtype):
+    """
+    Returns whether CUDA supports texture fetches with this numpy dtype.
+
+    The maximum word size for a texture fetch is four bytes.
+
+    With this trick also larger dtypes can be fetched:
+    https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
+
+    """
+    if hasattr(dtype, 'numpy_dtype'):
+        dtype = dtype.numpy_dtype
+
+    if isinstance(dtype, type):
+        return dtype().itemsize <= 4
+
+    return dtype.itemsize <= 4
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
@@ -12,10 +12,18 @@ from pystencils.transformations import (
    loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)


-def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
+def create_kernel(assignments,
+                  target='cpu',
+                  data_type="double",
+                  iteration_slice=None,
+                  ghost_layers=None,
                  skip_independence_check=False,
-                  cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
-                  gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
+                  cpu_openmp=False,
+                  cpu_vectorize_info=None,
+                  cpu_blocking=None,
+                  gpu_indexing='block',
+                  gpu_indexing_params=MappingProxyType({}),
+                  use_textures_for_interpolation=True):
    """
    Creates abstract syntax tree (AST) of kernel, using a list of update equations.

@@ -99,14 +107,22 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
        ast = create_cuda_kernel(assignments, type_info=data_type,
                                 indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
                                 iteration_slice=iteration_slice, ghost_layers=ghost_layers,
-                                 skip_independence_check=skip_independence_check)
+                                 skip_independence_check=skip_independence_check,
+                                 use_textures_for_interpolation=use_textures_for_interpolation)
        return ast
    else:
        raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))


-def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="double", coordinate_names=('x', 'y', 'z'),
-                          cpu_openmp=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
+def create_indexed_kernel(assignments,
+                          index_fields,
+                          target='cpu',
+                          data_type="double",
+                          coordinate_names=('x', 'y', 'z'),
+                          cpu_openmp=True,
+                          gpu_indexing='block',
+                          gpu_indexing_params=MappingProxyType({}),
+                          use_textures_for_interpolation=True):
    """
    Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
    coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
@@ -159,8 +175,12 @@ def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="do
    elif target == 'gpu':
        from pystencils.gpucuda import created_indexed_cuda_kernel
        idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
-        ast = created_indexed_cuda_kernel(assignments, index_fields, type_info=data_type,
-                                          coordinate_names=coordinate_names, indexing_creator=idx_creator)
+        ast = created_indexed_cuda_kernel(assignments,
+                                          index_fields,
+                                          type_info=data_type,
+                                          coordinate_names=coordinate_names,
+                                          indexing_creator=idx_creator,
+                                          use_textures_for_interpolation=use_textures_for_interpolation)
        return ast
    else:
        raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))

--- a/pystencils/spatial_coordinates.py
+++ b/pystencils/spatial_coordinates.py
+
+import sympy
+
+import pystencils
+import pystencils.astnodes
+
+x_, y_, z_ = tuple(pystencils.astnodes.LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(3))
+x_staggered, y_staggered, z_staggered = x_ + 0.5, y_ + 0.5, z_ + 0.5
+
+
+def x_vector(ndim):
+    return sympy.Matrix(tuple(pystencils.astnodes.LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(ndim)))
+
+
+def x_staggered_vector(ndim):
+    return sympy.Matrix(tuple(
+        pystencils.astnodes.LoopOverCoordinate.get_loop_counter_symbol(i) + 0.5 for i in range(ndim)
+    ))
--- a/pystencils/test_gettypeofexpression.py
+++ b/pystencils/test_gettypeofexpression.py
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+
+"""
+
+"""
+import sympy as sp
+
+from pystencils.data_types import create_type, get_type_of_expression
+
+
+def test_gettypeofexpression():
+    random_constant = sp.sqrt(sp.pi / 5)
+    type = get_type_of_expression(random_constant)
+    type == create_type('float64')
+
+    random_constant = sp.pi
+    type = get_type_of_expression(random_constant)
+    type == create_type('float64')
+
+    random_constant = sp.pi / sp.pi
+    type = get_type_of_expression(random_constant)
+    type == create_type('int64')
+
+
+def main():
+    test_gettypeofexpression()
+
+
+if __name__ == '__main__':
+    main()
--- a/pystencils/transformations.py
+++ b/pystencils/transformations.py
@@ -9,11 +9,13 @@ import sympy as sp
 from sympy.logic.boolalg import Boolean

 import pystencils.astnodes as ast
+import pystencils.integer_functions
 from pystencils.assignment import Assignment
 from pystencils.data_types import (
    PointerType, StructType, TypedSymbol, cast_func, collate_types, create_type, get_base_type,
    get_type_of_expression, pointer_arithmetic_func, reinterpret_cast_func)
 from pystencils.field import AbstractField, Field, FieldType
+from pystencils.interpolation_astnodes import InterpolatorAccess, TextureAccess, TextureCachedField
 from pystencils.kernelparameters import FieldPointerSymbol
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.slicing import normalize_slice
@@ -503,7 +505,10 @@ def resolve_field_accesses(ast_node, read_only_field_names=set(),
            if isinstance(expr, ast.ResolvedFieldAccess):
                return expr

-            new_args = [visit_sympy_expr(e, enclosing_block, sympy_assignment) for e in expr.args]
+            if hasattr(expr, 'args'):
+                new_args = [visit_sympy_expr(e, enclosing_block, sympy_assignment) for e in expr.args]
+            else:
+                new_args = []
            kwargs = {'evaluate': False} if type(expr) in (sp.Add, sp.Mul, sp.Piecewise) else {}
            return expr.func(*new_args, **kwargs) if new_args else expr

@@ -521,7 +526,9 @@ def resolve_field_accesses(ast_node, read_only_field_names=set(),
            if sub_ast.false_block:
                visit_node(sub_ast.false_block)
        else:
-            for i, a in enumerate(sub_ast.args):
+            if isinstance(sub_ast, (bool, int, float)):
+                return
+            for a in sub_ast.args:
                visit_node(a)

    return visit_node(ast_node)
@@ -798,12 +805,26 @@ class KernelConstraintsCheck:
            self.fields_read.add(rhs.field)
            self.fields_read.update(rhs.indirect_addressing_fields)
            return rhs
+        elif isinstance(rhs, InterpolatorAccess):
+            new_args = [self.process_expression(arg, type_constants) for arg in rhs.offsets]
+            if new_args:
+                rhs.offsets = new_args
+            return rhs
        elif isinstance(rhs, TypedSymbol):
            return rhs
        elif isinstance(rhs, sp.Symbol):
            return TypedSymbol(rhs.name, self._type_for_symbol[rhs.name])
        elif type_constants and isinstance(rhs, sp.Number):
            return cast_func(rhs, create_type(self._type_for_symbol['_constant']))
+        elif isinstance(rhs, sp.boolalg.BooleanFunction) or \
+                type(rhs) in pystencils.integer_functions.__dict__.values():
+            new_args = [self.process_expression(a, type_constants) for a in rhs.args]
+            types_of_expressions = [get_type_of_expression(a) for a in new_args]
+            arg_type = collate_types(types_of_expressions, forbid_collation_to_float=True)
+            new_args = [a if not hasattr(a, 'dtype') or a.dtype == arg_type
+                        else cast_func(a, arg_type)
+                        for a in new_args]
+            return rhs.func(*new_args)
        elif isinstance(rhs, sp.Mul):
            new_args = [self.process_expression(arg, type_constants) if arg not in (-1, 1) else arg for arg in rhs.args]
            return rhs.func(*new_args) if new_args else rhs
@@ -955,7 +976,8 @@ def insert_casts(node):
        # TODO optimize pow, don't cast integer on double
        types = [get_type_of_expression(arg) for arg in args]
        assert len(types) > 0
-        target = collate_types(types)
+        # Never ever, ever collate to float type for boolean functions!
+        target = collate_types(types, forbid_collation_to_float=isinstance(node.func, sp.boolalg.BooleanFunction))
        zipped = list(zip(args, types))
        if target.func is PointerType:
            assert node.func is sp.Add
@@ -1150,7 +1172,6 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None:

 def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int:
    """Blocking of loops to enhance cache locality. Modifies the ast node in-place.
-
    Args:
        ast_node: kernel function node before vectorization transformation has been applied
        block_size: sequence defining block size in x, y, (z) direction
@@ -1195,3 +1216,48 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int:
        inner_loop.start = block_ctr
        inner_loop.stop = stop
    return len(coordinates)
+
+
+def implement_interpolations(ast_node: ast.Node,
+                             implement_by_texture_accesses: bool = False,
+                             vectorize: bool = False,
+                             use_hardware_interpolation_for_f32=True):
+    # TODO: perform this function on assignments, when unify_shape_symbols allows differently sized fields
+
+    assert not(implement_by_texture_accesses and vectorize), \
+        "can only implement interpolations either by texture accesses or CPU vectorization"
+    FLOAT32_T = create_type('float32')
+
+    interpolation_accesses = ast_node.atoms(InterpolatorAccess)
+
+    def can_use_hw_interpolation(i):
+        return use_hardware_interpolation_for_f32 and i.dtype == FLOAT32_T and isinstance(i, TextureAccess)
+
+    if implement_by_texture_accesses:
+
+        interpolators = {a.symbol.interpolator for a in interpolation_accesses}
+        to_texture_map = {i: TextureCachedField.from_interpolator(i) for i in interpolators}
+
+        substitutions = {i: to_texture_map[i.symbol.interpolator].at(
+            [o for o in i.offsets]) for i in interpolation_accesses}
+
+        import pycuda.driver as cuda
+        for texture in substitutions.values():
+            if can_use_hw_interpolation(texture):
+                texture.filter_mode = cuda.filter_mode.LINEAR
+            else:
+                texture.filter_mode = cuda.filter_mode.POINT
+                texture.read_as_integer = True
+
+        ast_node.subs(substitutions)
+
+        # Update after replacements
+        interpolation_accesses = ast_node.atoms(InterpolatorAccess)
+
+    if vectorize:
+        # TODO can be done in _interpolator_access_to_stencils field.absolute_access == simd_gather
+        raise NotImplementedError()
+    else:
+        substitutions = {i: i.implementation_with_stencils()
+                         for i in interpolation_accesses if not can_use_hw_interpolation(i)}
+        ast_node.subs(substitutions)
No results found