# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.7
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %%
from pystencils.session import *
# %% [markdown]
# # Demo: Working with derivatives
#
#
# ## Overview
# This notebook demonstrates how to formulate continuous differential operators in *pystencils* and automatically derive finite difference stencils from them.
#
# Instead of using the built-in derivatives in *sympy*, *pystencils* comes with its own derivative objects. They represent spatial derivatives of pystencils fields.
# %%
f = ps.fields("f: [2D]")
first_derivative_x = ps.fd.diff(f, 0)
first_derivative_x
# %% [markdown]
# This object is the derivative of the field $f$ with respect to the first spatial coordinate $x$. To get a finite difference approximation a discretization strategy is required:
# %%
discretize_2nd_order = ps.fd.Discretization2ndOrder(dx=sp.symbols("h"))
discretize_2nd_order(first_derivative_x)
# %% [markdown]
# Strictly speaking, derivative objects act on *field accesses*, not *fields*, which is why there is a $(0,0)$ index on the field:
# %%
first_derivative_x
# %% [markdown]
# Sometimes it might be useful to specify derivatives at an offset, e.g.
# %%
derivative_offset = ps.fd.diff(f[0, 1], 0)
derivative_offset, discretize_2nd_order(derivative_offset)
# %% [markdown]
# Another example with second order derivatives:
# %%
laplacian = ps.fd.diff(f, 0, 0) + ps.fd.diff(f, 1, 1)
laplacian
# %%
discretize_2nd_order(laplacian)
# %% [markdown]
# ## Working with derivative terms
#
# No automatic simplifications are applied to derivative terms, i.e. linearity relations and the product rule are not applied automatically.
# %%
f, g = ps.fields("f, g: [2D]")
c = sp.symbols("c")
δ = ps.fd.diff
expr = δ(δ(f, 0) + δ(g, 0) + c + 5, 0)
expr
# %% [markdown]
# This nested term cannot be discretized automatically.
# %%
try:
discretize_2nd_order(expr)
except ValueError as e:
print(e)
# %% [markdown]
# ### Linearity
# The following function expands all derivatives exploiting linearity:
# %%
ps.fd.expand_diff_linear(expr)
# %% [markdown]
# The symbol $c$ that was included is interpreted as a function by default.
# We can control the simplification behaviour by specifying all functions or all constants:
# %%
ps.fd.expand_diff_linear(expr, functions=(f[0,0], g[0, 0]))
# %%
ps.fd.expand_diff_linear(expr, constants=[c])
# %% [markdown]
# The expanded term can then be discretized:
# %%
discretize_2nd_order(ps.fd.expand_diff_linear(expr, constants=[c]))
# %% [markdown]
# ### Product rule
#
# The next cells show how to apply the product rule and its reverse:
# %%
expr = δ(f[0, 0] * g[0, 0], 0)
expr
# %%
expanded_expr = ps.fd.expand_diff_products(expr)
expanded_expr
# %%
recombined_expr = ps.fd.combine_diff_products(expanded_expr)
recombined_expr
# %%
assert recombined_expr == expr
# %% [markdown]
# ### Evaluate derivatives
#
# The arguments of a derivative need not be fields; they only have to be fields when the term is to be discretized.
# The next cells show how to transform such terms to *sympy* derivatives and evaluate them.
# %%
k = sp.symbols("k")
expr = δ(k**3 + 2 * k, 0)
expr
# %%
ps.fd.evaluate_diffs(expr, var=k)
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.7
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %%
from pystencils.session import *
import shutil
# %%
try:
from scipy.ndimage import gaussian_filter
scipy_avail = True
except ImportError:
scipy_avail = False
# %% [markdown]
# # Plotting and Animation
#
#
# ## 1) Stencils
#
# This section shows how to visualize stencils, i.e. lists of directions
# %%
stencil_5pt = [(0, 0), (1, 0), (-1, 0), (0, 1), (0, -1)]
plt.figure(figsize=(2, 2))
ps.stencil.plot(stencil_5pt)
# %% [markdown]
# By default, the indices of the directions are shown at the arrow endpoints.
# Custom data can also be plotted:
# %%
plt.figure(figsize=(2, 2))
ps.stencil.plot(stencil_5pt, data=[2.0, 5, 'test', 42, 1])
# %% [markdown]
# The stencil can also be given as an expression in a single field:
# %%
f = ps.fields("f: [2D]")
c = sp.symbols("c")
expr = f[0, 0] + 2 * f[1, 0] + c * f[-1, 0] + 4 * f[0, 1] - c**2 * f[0, -1]
plt.figure(figsize=(2, 2))
ps.stencil.plot_expression(expr)
# %% [markdown]
# There is also a function to simply plot sympy expressions depending on one variable only:
# %%
x = sp.symbols("x")
plt.figure(figsize=(4, 4))
plt.sympy_function(x**2 - 1, x_values=np.linspace(-2, 2));
# %% [markdown]
# Three dimensional stencils can be visualized in two different ways
# %%
stencil_7pt = [(0, 0, 0), (1, 0, 0), (-1, 0, 0), (0, 1, 0), (0, -1, 0), (0, 0, 1), (0, 0, -1)]
ps.stencil.plot(stencil_7pt, data=[i for i in range(7)])
# %% [markdown]
# This kind of plot works well for small stencils, for larger stencils this gets messy:
# %%
stencil_27pt = [(i, j, k) for i in (-1, 0, 1) for j in (-1, 0, 1) for k in (-1, 0, 1)]
ps.stencil.plot(stencil_27pt, data=[i for i in range(27)])
# %% [markdown]
# Adding `slice=True` shows the three projections of the stencils instead
# %%
ps.stencil.plot(stencil_27pt, data=[i for i in range(27)], slice=True)
# %% [markdown]
# ## 2) Scalar fields
#
# *pystencils* also comes with helper functions to plot simulation data.
# The plotting functions are simple but useful wrappers around basic Matplotlib functionality.
# *pystencils* imports all matplotlib functions as well, such that one can use
#
# ```import pystencils.plot as plt```
#
# instead of
#
# ```import matplotlib.pyplot as plt```
#
# The session import at the top of the notebook does this already, and also sets up inline plotting for notebooks.
#
# This section demonstrates how to plot 2D scalar fields. Internally, `imshow` from matplotlib is used; however, the coordinate system is changed such that $(0,0)$ is in the lower left corner, the $x$-axis points to the right, and the $y$-axis upward.
#
# The next function returns a scalar field for demonstration
# %%
def example_scalar_field(t=0):
x, y = np.meshgrid(np.linspace(0, 2 * np.pi, 100), np.linspace(0, 2 * np.pi, 100))
z = np.cos(x + 0.1 * t) * np.sin(y + 0.1 * t) + 0.1 * x * y
return z
# %% [markdown]
# To show a single scalar field use `plt.scalar_field`:
# %%
arr = example_scalar_field()
plt.scalar_field(arr)
plt.colorbar()
# %% [markdown]
# A similar wrapper exists around `contour` from matplotlib:
# %%
plt.scalar_field_contour(arr);
# %% [markdown]
# A 3D surface plot is also possible:
# %%
plt.scalar_field_surface(arr)
# %% [markdown]
# For simulation results one often needs to visualize time-dependent data. To show an evolving scalar field, an animation can be created as shown in the next cell:
# %%
t = 0
def run_func():
global t
t += 1
return example_scalar_field(t)
# %%
if shutil.which("ffmpeg") is not None:
plt.figure()
animation = plt.scalar_field_animation(run_func, frames=60)
ps.jupyter.display_as_html_video(animation)
else:
print("No ffmpeg installed")
# %% [markdown]
# Another way to display animations is as an image sequence. While `run_func`, i.e. the simulation, is running, the current state is displayed as an HTML image. Use this method when your `run_func` takes a long time and you want to see the status while the simulation runs. The `frames` parameter specifies how often the run function will be called.
# %%
plt.figure()
animation = plt.scalar_field_animation(run_func, frames=30)
ps.jupyter.display_as_html_image(animation)
# %% [markdown]
# For surface plots there is also an animated version:
# %%
if shutil.which("ffmpeg") is not None:
animation = plt.surface_plot_animation(run_func, frames=60)
ps.jupyter.display_as_html_video(animation)
else:
print("No ffmpeg installed")
# %% [markdown]
# ## 3) Vector Fields
#
# *pystencils* provides another set of plotting functions for vector fields, e.g. velocity fields.
# Such arrays have three dimensions, where the last coordinate has to have two entries, representing the $x$ and $y$ components of the vector.
# %%
def example_vector_field(t=0, shape=(40, 40)):
result = np.empty(shape + (2,))
x, y = np.meshgrid(np.linspace(0, 2 * np.pi, shape[0]), np.linspace(0, 2 * np.pi, shape[1]))
result[..., 0] = np.cos(x + 0.1 * t) * np.sin(y + 0.1 * t) + 0.01 * x * y
result[..., 1] = np.cos(0.001 * y)
return result
# %% [markdown]
# Vector fields can either be plotted using quivers (arrows)
# %%
plt.figure()
data = example_vector_field()
plt.vector_field(data, step=1);
# %% [markdown]
# The `step` parameter can be used to display only every $n$'th arrow:
# %%
plt.figure()
plt.vector_field(data, step=2);
# %% [markdown]
# or by displaying their magnitude in a colormap
# %%
plt.figure()
plt.vector_field_magnitude(data);
# %% [markdown]
# There are also functions to animate both variants, quiver plots...
# %%
t = 0
def run_func():
global t
t += 1
return example_vector_field(t)
# %%
if shutil.which("ffmpeg") is not None:
plt.figure()
animation = plt.vector_field_animation(run_func, frames=60)
ps.jupyter.display_as_html_video(animation)
else:
print("No ffmpeg installed")
# %% [markdown]
# ...and magnitude plots
# %%
if shutil.which("ffmpeg") is not None:
animation = plt.vector_field_magnitude_animation(run_func, frames=60)
ps.jupyter.display_as_html_video(animation)
else:
print("No ffmpeg installed")
# %% [markdown]
# ## 4) Phase Fields
#
# A third group of plotting functions helps to display arrays as they occur in phase-field simulations. However, these functions may also be useful for other kinds of simulations.
# They expect arrays where the last coordinate indicates the fraction of a certain component, i.e. `arr[x, y, 2]` should be a value between $0$ and $1$, specifying the fraction of the third phase at $(x, y)$. The plotting functions expect that the sum over the last coordinate gives $1$ for all cells.
#
# Let's again generate some example data:
# %%
def example_phase_field():
if scipy_avail:
from scipy.ndimage import gaussian_filter
shape=(80, 80)
result = np.zeros(shape + (4,))
result[20:40, 20:40, 0] = 1
if scipy_avail:
gaussian_filter(result[..., 0], sigma=3, output=result[..., 0])
result[50:70, 30:50, 1] = 1
if scipy_avail:
gaussian_filter(result[..., 1], sigma=3, output=result[..., 1])
result[:, :10, 2] = 1
result[:, :, 3] = 1 - np.sum(result, axis=2)
return result
data = example_phase_field()
# %% [markdown]
# The `scalar_field_alpha_value` function uses the last entry, i.e. the value between 0 and 1, as the alpha value of the specified colour to show where the phase is located. This visualization makes it easy to distinguish between smooth and sharp transitions. Note: if `scipy` is not installed, the phase field has a sharp interface, because no Gaussian filter was applied above. This is not realistic, but it still demonstrates the visualization functions.
# %%
plt.scalar_field_alpha_value(data[..., 0], color='b')
plt.scalar_field_alpha_value(data[..., 1], color='r')
plt.scalar_field_alpha_value(data[..., 2], color='k')
# %% [markdown]
# To see all phases at once, the `phase_plot` function applies this alpha-value representation to every phase:
# %%
plt.phase_plot(data)
# %% [markdown]
# Another option is to display each field separately in a subplot:
# %%
plt.multiple_scalar_fields(data)
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.7
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---
# %% nbsphinx="hidden"
import psutil
from pystencils.session import *
import shutil
# %% [markdown]
# # Demo: Finite differences - 2D wave equation
#
# In this tutorial we show how to use the finite difference module of *pystencils* to solve a 2D wave equation. The time derivative is discretized by a simple forward Euler method.
# %% [markdown]
# $$ \frac{\partial^2 u}{\partial t^2} = \mbox{div} \left( q(x,y) \nabla u \right)$$
# %% [markdown]
# We begin by creating three *numpy* arrays for the current, the previous and the next timestep. Actually we will see later that two fields are enough, but let's keep it simple. From these *numpy* arrays we create *pystencils* fields to formulate our update rule.
# %%
size = (60, 70) # domain size
u_arrays = [np.zeros(size), np.zeros(size), np.zeros(size)]
u_fields = [ps.Field.create_from_numpy_array("u%s" % (name,), arr)
for name, arr in zip(["0", "1", "2"], u_arrays)]
# Nicer display for fields
for i, field in enumerate(u_fields):
field.latex_name = "u^{(%d)}" % (i,)
# %% [markdown]
# *pystencils* already contains simple rules to discretize a diffusion term. The time discretization is done manually.
# %%
discretize = ps.fd.Discretization2ndOrder()
def central2nd_time_derivative(fields):
f_next, f_current, f_last = fields
return (f_next[0, 0] - 2 * f_current[0, 0] + f_last[0, 0]) / discretize.dt**2
rhs = ps.fd.diffusion(u_fields[1], 1)
wave_eq = sp.Eq(central2nd_time_derivative(u_fields), discretize(rhs))
wave_eq = sp.simplify(wave_eq)
wave_eq
# %% [markdown]
# The explicit Euler scheme is now obtained by solving the above equation for $u_C^{next}$.
# %%
u_next_C = u_fields[-1][0, 0]
update_rule = ps.Assignment(u_next_C, sp.solve(wave_eq, u_next_C)[0])
update_rule
# %% [markdown]
# Before creating the kernel, we substitute numeric values for $dx$ and $dt$.
# Then a kernel is created just like in the last tutorial.
# %%
update_rule = update_rule.subs({discretize.dx: 0.1, discretize.dt: 0.05})
ast = ps.create_kernel(update_rule)
kernel = ast.compile()
ps.show_code(ast)
# %%
ast.get_parameters()
# %%
ast.fields_accessed
# %% [markdown]
# To run the simulation, a suitable initial condition and boundary treatment are required. We choose an initial condition which is zero at the borders of the domain. The outermost layer is not changed by the update kernel, so we have an implicit homogeneous Dirichlet boundary condition.
# %%
X, Y = np.meshgrid(np.linspace(0, 1, size[1]), np.linspace(0, 1, size[0]))
Z = np.sin(2*X*np.pi) * np.sin(2*Y*np.pi)
# Initialize the previous and current values with the initial function
np.copyto(u_arrays[0], Z)
np.copyto(u_arrays[1], Z)
# The values for the next timesteps do not matter, since they are overwritten
u_arrays[2][:, :] = 0
# %% [markdown]
# One timestep now consists of applying the kernel once, then shifting the arrays.
# %%
def run(timesteps=1):
for t in range(timesteps):
kernel(u0=u_arrays[0], u1=u_arrays[1], u2=u_arrays[2])
u_arrays[0], u_arrays[1], u_arrays[2] = u_arrays[1], u_arrays[2], u_arrays[0]
return u_arrays[2]
# %% [markdown]
# Let's create an animation of the solution:
# %%
if shutil.which("ffmpeg") is not None:
ani = plt.surface_plot_animation(run, zlim=(-1, 1))
ps.jupyter.display_as_html_video(ani)
else:
print("No ffmpeg installed")
# %%
assert np.isfinite(np.max(u_arrays[2]))
# %% [markdown]
# __Running on GPU__
#
# We can also run the same kernel on the GPU by using the ``cupy`` package.
# %%
try:
import cupy
except ImportError:
cupy=None
print('No cupy installed')
res = None
if cupy:
gpu_ast = ps.create_kernel(update_rule, target=ps.Target.GPU)
gpu_kernel = gpu_ast.compile()
res = ps.show_code(gpu_ast)
res
# %% [markdown]
# The run function now has to be changed slightly: the data has to be transferred to the GPU first, then the kernel can be executed, and in the end the data has to be transferred back.
# %%
if cupy:
def run_on_gpu(timesteps=1):
# Transfer arrays to GPU
gpuArrs = [cupy.asarray(cpu_array) for cpu_array in u_arrays]
for t in range(timesteps):
gpu_kernel(u0=gpuArrs[0], u1=gpuArrs[1], u2=gpuArrs[2])
gpuArrs[0], gpuArrs[1], gpuArrs[2] = gpuArrs[1], gpuArrs[2], gpuArrs[0]
# Transfer arrays to CPU
for gpuArr, cpuArr in zip(gpuArrs, u_arrays):
cpuArr[:] = gpuArr.get()
assert np.isfinite(np.max(u_arrays[2]))
# %%
if cupy:
run_on_gpu(400)
.. _page_tutorials:
Tutorials
=========
@@ -8,14 +10,14 @@ It is a good idea to download them and run them directly to be able to play around
.. toctree::
:maxdepth: 1
/notebooks/01_tutorial_getting_started.ipynb
/notebooks/02_tutorial_basic_kernels.ipynb
/notebooks/03_tutorial_datahandling.ipynb
/notebooks/04_tutorial_advection_diffusion.ipynb
/notebooks/05_tutorial_phasefield_spinodal_decomposition.ipynb
/notebooks/06_tutorial_phasefield_dentritic_growth.ipynb
/notebooks/demo_assignment_collection.ipynb
/notebooks/demo_plotting_and_animation.ipynb
/notebooks/demo_derivatives.ipynb
/notebooks/demo_benchmark.ipynb
/notebooks/demo_wave_equation.ipynb
01_tutorial_getting_started.ipynb
02_tutorial_basic_kernels.ipynb
03_tutorial_datahandling.ipynb
04_tutorial_advection_diffusion.ipynb
05_tutorial_phasefield_spinodal_decomposition.ipynb
06_tutorial_phasefield_dentritic_growth.ipynb
demo_assignment_collection.ipynb
demo_plotting_and_animation.ipynb
demo_derivatives.ipynb
demo_benchmark.ipynb
demo_wave_equation.ipynb
---
file_format: mystnb
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
mystnb:
execution_mode: cache
---
# Working with Data Types
This guide will demonstrate the various options that exist to customize the data types
in generated kernels.
Data types can be modified on different levels of granularity:
individual fields and symbols,
single subexpressions,
or the entire kernel.
```{code-cell} ipython3
:tags: [remove-cell]
import pystencils as ps
import sympy as sp
```
## Changing the Default Data Types
The pystencils code generator defines two default data types:
- The default *numeric type*, which is applied to all numerical computations that are not
otherwise explicitly typed; the default is `float64`.
- The default *index type*, which is used for all loop and field index calculations; the default is `int64`.
These can be modified by setting the
{any}`default_dtype <CreateKernelConfig.default_dtype>` and
{any}`index_dtype <CreateKernelConfig.index_dtype>`
options of the code generator configuration:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig()
cfg.default_dtype = "float32"
cfg.index_dtype = "int32"
```
Modifying these will change the way types for [untyped symbols](#untyped-symbols)
and [dynamically typed expressions](#dynamic-typing) are computed.
## Setting the Types of Fields and Symbols
(untyped-symbols)=
### Untyped Symbols
Symbols used inside a kernel are most commonly created using
{any}`sp.symbols <sympy.core.symbol.symbols>` or
{any}`sp.Symbol <sympy.core.symbol.Symbol>`.
These symbols are *untyped*; they will receive a type during code generation
according to these rules:
- Free untyped symbols (i.e. symbols not defined by an assignment inside the kernel) receive the
{any}`default data type <CreateKernelConfig.default_dtype>` specified in the code generator configuration.
- Bound untyped symbols (i.e. symbols that *are* defined in an assignment)
receive the data type that was computed for the right-hand side expression of their defining assignment.
If you are working on kernels with homogeneous data types, using untyped symbols will mostly be enough.
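
As a brief illustration, consider a kernel with one bound and one free untyped symbol.
This is a minimal sketch under the default configuration (the field and symbol names are chosen for this example only; we read the inferred types off the kernel's parameter list):

```{code-cell} ipython3
u = ps.fields("u: float64[2D]")
a, b = sp.symbols("a, b")

assignments = [
    # `a` is bound: it receives the type inferred for its defining RHS (float64, via `u`)
    ps.Assignment(a, u(0) + 1),
    # `b` is free: it receives the configured default numeric type
    ps.Assignment(u(0), a * b),
]

kernel = ps.create_kernel(assignments)
[(p.name, str(p.dtype)) for p in kernel.parameters]
```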
### Explicitly Typed Symbols and Fields
If you need more control over the data types in (parts of) your kernel,
you will have to explicitly specify them.
To set an explicit data type for a symbol, use the {any}`TypedSymbol` class of pystencils:
```{code-cell} ipython3
x_typed = ps.TypedSymbol("x", "uint32")
x_typed, str(x_typed.dtype)
```
You can set a `TypedSymbol` to any data type provided by [the type system](#page_type_system),
which will then be enforced by the code generator.
The same holds for fields:
When creating fields through the {any}`fields <pystencils.field.fields>` function,
add the type to the descriptor string; for instance:
```{code-cell} ipython3
f, g = ps.fields("f(1), g(3): float32[3D]")
str(f.dtype), str(g.dtype)
```
When using `Field.create_generic` or `Field.create_fixed_size`, on the other hand,
you can set the data type via the `dtype` keyword argument.
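
For instance, a short sketch using the `dtype` keyword of `Field.create_fixed_size`:

```{code-cell} ipython3
h_field = ps.Field.create_fixed_size("h", (32, 32), dtype="float16")
str(h_field.dtype)
```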
(dynamic-typing)=
### Dynamically Typed Symbols and Fields
Apart from explicitly setting data types,
`TypedSymbol`s and fields can also receive a *dynamic data type* (see {any}`DynamicType`).
There are two options:
- Symbols or fields annotated with {any}`DynamicType.NUMERIC_TYPE` will always receive
the {any}`default numeric type <CreateKernelConfig.default_dtype>` configured for the
code generator.
This is the default setting for fields
created through `fields`, `Field.create_generic` or `Field.create_fixed_size`.
- When annotated with {any}`DynamicType.INDEX_TYPE`, on the other hand, they will receive
the {any}`index data type <CreateKernelConfig.index_dtype>` configured for the kernel.
Using dynamic typing, you can ensure that symbols receive either the standard numeric or
index type without explicitly stating it, such that your kernel definition becomes
independent of the code generator configuration.
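
For example, a relaxation rate could be pinned to the numeric type and a cell count to the index type (a sketch; the symbol names are illustrative):

```{code-cell} ipython3
omega = ps.TypedSymbol("omega", ps.DynamicType.NUMERIC_TYPE)
n_cells = ps.TypedSymbol("n_cells", ps.DynamicType.INDEX_TYPE)
omega.dtype, n_cells.dtype
```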
## Mixing Types Inside Expressions
Pystencils enforces that all symbols, constants, and fields occurring inside an expression
have the same data type.
The code generator will never introduce implicit casts -- if any type conflicts arise, it will terminate with an error.
Still, there are cases where you want to combine subexpressions of different types;
maybe you need to compute geometric information from loop counters or other integers,
or you are doing mixed-precision numerical computations.
In these cases, you might have to introduce explicit type casts when values move from one type context to another.
<!-- 2. Annotate expressions with a specific data type to ensure computations are performed in that type.
TODO: See #97 (https://i10git.cs.fau.de/pycodegen/pystencils/-/issues/97)
-->
(type_casts)=
### Type Casts
Type casts can be introduced into kernels using the {any}`tcast` symbolic function.
It takes an expression and a data type, which is either an explicit type (see [the type system](#page_type_system))
or a dynamic type ({any}`DynamicType`):
```{code-cell} ipython3
x, y = sp.symbols("x, y")
expr1 = ps.tcast(x, "float32")
expr2 = ps.tcast(3 + y, ps.DynamicType.INDEX_TYPE)
str(expr1.dtype), str(expr2.dtype)
```
When a type cast occurs, pystencils will compute the type of its argument independently
and then introduce a runtime cast to the target type.
That target type must comply with the type computed for the outer expression,
which the cast is embedded in.
## Understanding the pystencils Type Inference System
To correctly apply varying data types to pystencils kernels, it is important to understand
how pystencils computes and propagates the data types of symbols and expressions.
Type inference happens on the level of assignments.
For each assignment $x := \mathrm{calc}(y_1, \dots, y_n)$,
the system first attempts to compute a *unique* type for the right-hand side (RHS) $\mathrm{calc}(y_1, \dots, y_n)$.
It searches for any subexpression inside the RHS for which a type is already known --
these might be typed symbols
(whose types are either set explicitly by the user,
or have been determined from their defining assignment),
field accesses,
or explicitly typed expressions.
It then attempts to apply that data type to the entire expression.
If type conflicts occur, the process fails and the code generator raises an error.
Otherwise, the resulting type is assigned to the left-hand side symbol $x$.
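
As a miniature example of these rules (a sketch, not a new API): in the assignment below, the access to the `float32` field is the only subexpression with a known type, so that type is applied to the constant `3` and finally to the bound symbol `x`.

```{code-cell} ipython3
f32 = ps.fields("f32: float32[2D]")
x = sp.Symbol("x")

# f32(0) is known to be float32; during code generation this type is
# propagated to the entire RHS and then assigned to `x`.
ps.Assignment(x, f32(0) + 3)
```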
:::{admonition} Developer's To Do
It would be great to illustrate this using a GraphViz-plot of an AST,
with nodes colored according to their data types
:::
---
jupytext:
formats: md:myst
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
mystnb:
execution_mode: cache
---
```{code-cell} ipython3
:tags: [remove-cell]
import sympy as sp
import pystencils as ps
import numpy as np
import matplotlib.pyplot as plt
```
(guide_gpukernels)=
# Pystencils for GPUs
Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
as well as just-in-time compilation and execution of CUDA kernels from within Python
based on the [cupy] library.
This section's objective is to give a detailed introduction to the creation of
GPU kernels with pystencils.
## Generate, Compile and Run CUDA Kernels
In order to obtain a CUDA implementation of a symbolic kernel, naught more is required
than setting the {any}`target <CreateKernelConfig.target>` code generator option to
{any}`Target.CUDA`:
```{code-cell} ipython3
f, g = ps.fields("f, g: float64[3D]")
update = ps.Assignment(f.center(), 2 * g.center())
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
kernel = ps.create_kernel(update, cfg)
ps.inspect(kernel)
```
The `kernel` object returned by the code generator in the above snippet is an instance
of the {py:class}`GpuKernel` class.
It extends {py:class}`Kernel` with some GPU-specific information.
In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
property, which tells us how many threads the kernel is expecting to be executed with:
```{code-cell} ipython3
kernel.threads_range
```
If a GPU is available and [CuPy][cupy] is installed in the current environment,
the kernel can be compiled and run immediately.
To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field.
:::{note}
[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
which operates much in the same way that [NumPy][numpy] works on CPU arrays.
Cupy and NumPy expose nearly the same APIs for array operations;
the difference being that CuPy allocates all its arrays on the GPU
and performs its operations as CUDA kernels.
Also, CuPy exposes a just-in-time compiler for GPU kernels, which internally calls [nvcc].
In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
and to allocate and manage the data these kernels can be executed on.
For more information on CuPy, refer to [their documentation][cupy-docs].
:::
```{code-cell} ipython3
:tags: [raises-exception]
import cupy as cp
rng = cp.random.default_rng(seed=42)
f_arr = rng.random((16, 16, 16))
g_arr = cp.zeros_like(f_arr)
kfunc = kernel.compile()
kfunc(f=f_arr, g=g_arr)
```
### Modifying the Launch Grid
The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
This object holds the kernel's launch grid configuration
(i.e. the number of thread blocks, and the number of threads per block.)
Pystencils specifies a default value for the block size and, if possible,
the number of blocks is automatically inferred in order to cover the entire iteration space.
In addition, the wrapper's interface allows us to customize the GPU launch grid,
by manually setting both the number of threads per block, and the number of blocks on the grid:
```{code-cell} ipython3
kfunc.block_size = (16, 8, 8)
kfunc.num_blocks = (1, 2, 2)
```
For most kernels, setting only the `block_size` is sufficient since pystencils will
automatically compute the number of blocks;
for exceptions to this, see [](#manual_launch_grids).
If `num_blocks` is set manually and the launch grid thus specified is too small, only
a part of the iteration space will be traversed by the kernel;
similarly, if it is too large, any threads working outside of the iteration bounds will idle.
(manual_launch_grids)=
### Manual Launch Grids and Non-Cuboid Iteration Patterns
In some cases, it will be unavoidable to set the launch grid size manually;
especially if the code generator is unable to automatically determine the size of the
iteration space.
An example for this is the triangular iteration previously described in the [Kernel Creation Guide](#example_triangular_iteration).
Let's set it up once more:
```{code-cell} ipython3
:tags: [remove-cell]
def _draw_ispace(f_arr):
n, m = f_arr.shape
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, m, 4))
ax.set_yticks(np.arange(0, n, 4))
# ax.set_xticklabels([])
# ax.set_yticklabels([])
ax.set_xticks(np.arange(-.5, m, 1), minor=True)
ax.set_yticks(np.arange(-.5, n, 1), minor=True)
ax.grid(which="minor", linewidth=2)
ax.tick_params(which='minor', bottom=False, left=False)
ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
```
```{code-cell} ipython3
:tags: [remove-cell]
f = ps.fields("f: float64[2D]")
assignments = [
ps.Assignment(f(0), 1)
]
```
```{code-cell} ipython3
y = ps.DEFAULTS.spatial_counters[0]
cfg = ps.CreateKernelConfig(
target=ps.Target.CUDA,
iteration_slice=ps.make_slice[:, y:]
)
kernel = ps.create_kernel(assignments, cfg).compile()
```
This warns us that the threads range could not be determined automatically.
We can disable this warning by setting `manual_launch_grid` in the GPU option category:
```{code-cell}
cfg.gpu.manual_launch_grid = True
```
Now, to execute our kernel, we have to manually specify its launch grid:
```{code-cell} ipython3
kernel.block_size = (8, 8)
kernel.num_blocks = (2, 2)
```
This way the kernel will cover this iteration space:
```{code-cell} ipython3
:tags: [remove-input]
f_arr = cp.zeros((16, 16))
kernel(f=f_arr)
_draw_ispace(cp.asnumpy(f_arr))
```
We can also observe the effect of decreasing the launch grid size:
```{code-cell} ipython3
kernel.block_size = (4, 4)
kernel.num_blocks = (2, 3)
```
```{code-cell} ipython3
:tags: [remove-input]
f_arr = cp.zeros((16, 16))
kernel(f=f_arr)
_draw_ispace(cp.asnumpy(f_arr))
```
Here, since there are only eight threads operating in $x$-direction,
and twelve threads in $y$-direction,
only a part of the triangle is being processed.
## API Reference
```{eval-rst}
.. autosummary::
:nosignatures:
pystencils.codegen.GpuKernel
pystencils.jit.gpu_cupy.CupyKernelWrapper
```
:::{admonition} Developers To Do:
- Fast approximation functions
- Fp16 on GPU
:::
[cupy]: https://cupy.dev "CuPy Homepage"
[numpy]: https://numpy.org "NumPy Homepage"
[nvcc]: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html "NVIDIA CUDA Compiler Driver"
[cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
---
jupytext:
formats: md:myst
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
mystnb:
execution_mode: cache
---
```{code-cell} ipython3
:tags: [remove-cell]
import sympy as sp
import pystencils as ps
import numpy as np
import matplotlib.pyplot as plt
```
(guide_kernelcreation)=
# Kernel Creation
Once a kernel's assignments are fully assembled, they need to be passed through pystencils' code
generation engine in order to produce the kernel's executable code.
The goal of this chapter is to shed light on pystencils' main code generation pipeline.
Here, we show how to invoke the code generator and discuss its various configuration options
and their effects on the generated kernel.
## Running the Code Generator
The primary way to invoke the code generation engine is through the {any}`create_kernel` function.
It takes two arguments:
- the list of assignments that make up the kernel (optionally wrapped as an ``AssignmentCollection``),
- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.codegen.config.CreateKernelConfig>`.
```{eval-rst}
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
create_kernel
CreateKernelConfig
```
For a simple kernel, an invocation of the code generator might look like this:
```{code-cell} ipython3
# Symbol and field definitions
u_src, u_dst, f = ps.fields("u_src, u_dst, f: float32[2D]")
h = sp.Symbol("h")
# Kernel definition
update = [
ps.Assignment(
u_dst[0,0], (h**2 * f[0, 0] + u_src[1, 0] + u_src[-1, 0] + u_src[0, 1] + u_src[0, -1]) / 4
)
]
# Code Generator Configuration
cfg = ps.CreateKernelConfig(
target=ps.Target.CUDA,
default_dtype="float32",
ghost_layers=1
)
kernel = ps.create_kernel(update, cfg)
```
The above snippet defines a five-point-stencil Jacobi update. A few noteworthy things are going on:
- The target data type of the kernel is to be `float32`.
  This is explicitly specified for the three fields `u_src`, `u_dst` and `f`.
  For the symbol `h`, this is left implicit; `h` therefore learns its data type from the `default_dtype` configuration option.
- The target hardware for this kernel are Nvidia GPUs; this is reflected by the `target` property being set to `Target.CUDA`.
- As the five-point stencil reads data from neighbors offset by one cell, it cannot legally execute on the outermost
  layer of nodes of the fields' 2D arrays. Here, we ensure that these outer layers are excluded by setting `ghost_layers=1`.
This is not strictly necessary, since the code generator could infer that information by itself.
## Inspecting the Generated Code
The object returned by the code generator, here named `kernel`, is an instance of the {any}`Kernel` class.
This object stores the kernel's name, its list of parameters, the set of fields it operates on, and its hardware target.
Also, it of course holds the kernel itself, in the form of an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST).
This tree can be printed out as compilable code in the target language (C++ or, in this case, CUDA),
but holds much more information than the printed-out code string.
When working in a Jupyter notebook, we can interactively inspect the kernel using pystencils' `inspect` function.
This reveals a widget that allows us to investigate various details about the kernel:
- its general properties, such as name, parameters, fields, target, ...;
- its code, represented in the *pystencils IR syntax*;
- the same code, in native C++/CUDA syntax;
- and a visual representation of its abstract syntax tree.
```{code-cell} ipython3
ps.inspect(kernel)
```
## Configuring the Code Generator
The code generation engine can be configured using a wide range of options.
This section aims at explaining the majority of these options,
their interaction and effects, use cases and caveats.
### Target Specification
Pystencils supports code generation for a variety of CPU and GPU hardware.
```{eval-rst}
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
CreateKernelConfig.target
Target
```
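
For instance, a configuration may select one of these targets like so (a short sketch reusing the targets that appear elsewhere in this guide):

```{code-cell} ipython3
cpu_cfg = ps.CreateKernelConfig(target=ps.Target.X86_AVX512)
gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
```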
### Data Types
To produce valid output code, the code generator has to figure out the data types of each
symbol, expression, and assignment occurring inside a kernel.
This happens roughly according to the following rules:
- **Field Accesses:** Each field has a fixed data type set at its creation, which is also applied to
each access to that field.
- **Symbols:** Symbols obtain their data types from two sources.
  A symbol occurring first on the left-hand side of an assignment receives the data type that
  was inferred for the right-hand side expression of that assignment.
  Symbols occurring first inside some expression on the right-hand side of an assignment, on the other
  hand, receive the {any}`default_dtype <CreateKernelConfig.default_dtype>` set in the {any}`CreateKernelConfig`.
We can observe this behavior by setting up a kernel including several fields with different data types:
```{code-cell} ipython3
from pystencils.sympyextensions import tcast
f = ps.fields("f: float32[2D]")
g = ps.fields("g: float16[2D]")
x, y, z = sp.symbols("x, y, z")
assignments = [
ps.Assignment(x, 42),
ps.Assignment(y, f(0) + x),
ps.Assignment(z, g(0))
]
cfg = ps.CreateKernelConfig(
default_dtype="float32",
index_dtype="int32"
)
kernel = ps.create_kernel(assignments, cfg)
```
We can take a look at the result produced by the code generator after parsing the above kernel.
Inspecting the internal representation of the kernel's body and loop nest,
we see that `x` has received the `float32` type,
which was specified via the {py:data}`default_dtype <CreateKernelConfig.default_dtype>` option.
The symbol `z`, on the other hand, has inherited its data type `float16` from the access to the field `g`
on its declaration's right-hand side.
Also, we can observe that the loop counters and symbols related to the field's memory layout
are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKernelConfig.index_dtype>`:
```{code-cell} ipython3
:tags: [remove-input]
driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
kernel = driver(assignments)
ps.inspect(driver.intermediates.materialized_ispace, show_cpp=False)
```
:::{note}
To learn more about inspecting code after different stages of the code generator, refer to [this section](#section_codegen_stages).
:::
```{eval-rst}
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
CreateKernelConfig.default_dtype
CreateKernelConfig.index_dtype
```
### The Iteration Space
The *domain fields* a kernel operates on are understood to reside on a common,
one-, two- or three-dimensional computational grid.
The grid points may be understood as vertices or cells, depending on the application.
When executed, the kernel performs a computation and updates values on all, or a specific subset
of, these grid points.
The set of points the kernel actually operates on is defined by its *iteration space*.
There are three distinct options to control the iteration space in the code generator,
only one of which can be specified at a time:
- The ``ghost_layers`` option allows us to specify a number of layers of grid points on all
  domain borders that should be excluded from iteration;
- The ``iteration_slice`` option allows us to describe the iteration space using Pythonic slice objects;
- The ``index_field`` option can be used to realize a sparse list-based iteration by passing a special
  *index field* which holds a list of all points that should be processed.
:::{note}
The points within a kernel's iteration space are understood to be processed concurrently and in
no particular order;
the output of any kernel that relies on some specific iteration order is therefore undefined.
(When running on a GPU, all grid points might in fact be processed in perfect simultaneity!)
:::
```{eval-rst}
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
CreateKernelConfig.ghost_layers
CreateKernelConfig.iteration_slice
CreateKernelConfig.index_field
```
```{code-cell} ipython3
:tags: [remove-cell]
def _draw_ispace(f_arr):
n, m = f_arr.shape
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, m, 4))
ax.set_yticks(np.arange(0, n, 4))
# ax.set_xticklabels([])
# ax.set_yticklabels([])
ax.set_xticks(np.arange(-.5, m, 1), minor=True)
ax.set_yticks(np.arange(-.5, n, 1), minor=True)
ax.grid(which="minor", linewidth=2)
ax.tick_params(which='minor', bottom=False, left=False)
ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
```
#### Specifying Ghost Layers
One way to alter the iteration space is by introducing ghost layers on the domain borders.
These layers of grid points are stripped from the iterations, and can be used to hold
boundary values or exchange data in MPI-parallel simulations.
##### Automatic Ghost Layers
The easiest way to define an iteration space with ghost layers
is to set `ghost_layers=ps.AUTO`, which is also the default
when no iteration space options are specified.
In this case, the code generator will examine the kernel to find the maximum range
of its stencil -- that is, the maximum neighbor offset encountered in any field access.
If, for instance, a neighbor node in $x$-direction with offset $k$ is accessed by the kernel,
it cannot legally execute on the outermost $k$ layers of nodes in that direction since it would
access memory out-of-bounds.
Therefore, an automatic number of $k$ ghost layers at each domain border is inferred.
As we can see in the example below, the number of inferred ghost layers at each domain border will be set to the maximum required in any dimension.
```{code-cell} ipython3
:tags: [remove-cell]
u, v = ps.fields("u, v: [2D]")
```
To illustrate, the following kernel accesses neighbor nodes with a maximum offset of two:
```{code-cell} ipython3
ranged_update = ps.Assignment(u.center(), v[-2, -1] + v[2, 1])
cfg = ps.CreateKernelConfig(ghost_layers=ps.AUTO)
kernel = ps.create_kernel(ranged_update, cfg)
```
With `ghost_layers=ps.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded).
```{code-cell} ipython3
:tags: [remove-input]
f = ps.fields("f: float64[2D]")
assignments = [
ranged_update,
ps.Assignment(f(0), 1)
]
kernel = ps.create_kernel(assignments).compile()
f_arr = np.zeros((16, 16))
u_arr = np.zeros_like(f_arr)
v_arr = np.zeros_like(f_arr)
kernel(f=f_arr, u=u_arr, v=v_arr)
_draw_ispace(f_arr)
```
##### Uniform and Nonuniform Ghost Layers
```{code-cell} ipython3
:tags: [remove-cell]
def _show_ispace(cfg):
f = ps.fields("f: float64[2D]")
assignments = [
ps.Assignment(f(0), 1)
]
kernel = ps.create_kernel(assignments, cfg).compile()
f_arr = np.zeros((16, 16))
kernel(f=f_arr)
_draw_ispace(f_arr)
```
Setting `ghost_layers` to a number will remove that many layers from the iteration space in each dimension:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
ghost_layers=1
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
Ghost layers can also be specified individually for each dimension and lower/upper borders,
by passing a sequence with either a single integer or a pair of integers per dimension:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
ghost_layers=[(2, 1), 3]
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
#### Iteration Slices
Using the `iteration_slice` option, we can exert much finer control over the kernel's iteration space
by specifying it as a sequence of Python {py:class}`slice` objects.
We can quickly create those with `ps.make_slice`, using the `start:stop:step` slice notation.
The easiest case is to set the iteration space with fixed numerical limits:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[3:-4, 9:14]
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
##### Strided Iteration
It is also possible to set up a strided iteration that skips over a fixed number of elements.
The following example processes only every second line in $y$-direction, using the slice `::2`:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[::2, 3:-3]
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
(example_triangular_iteration)=
##### Triangular Iteration
Iteration slices are not limited to constant numerical values; they can be arbitrarily complex
*SymPy* expressions.
By using the counter symbol for the first dimension to define the iteration limits of the second,
we can produce a triangular iteration pattern:
```{code-cell} ipython3
y = ps.DEFAULTS.spatial_counters[0]
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[:, y:]
)
```
:::{warning}
This type of dependency is restricted by the ordering of the iteration space dimensions:
The limits of a dimension can only depend on the counters of dimensions that are *slower*
than itself.
The ordering of dimensions is determined by the memory layout of the kernels' fields;
see also the [section on memory layouts](#section_memory_layout).
:::
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
##### Red-Black Iteration
Using a case distinction for the second dimension's start index, we can even produce
a checkerboard pattern, as required for e.g. red-black Gauss-Seidel-type smoothers.
We use the integer remainder ({any}`int_rem`) to distinguish between even- and odd-numbered rows,
set the start value accordingly using {any}`sp.Piecewise <sympy.functions.elementary.piecewise.Piecewise>`,
and use a step size of two:
$$
start(y)=
\begin{cases}
0 & \quad \text{if } y \; \mathrm{rem} \; 2 = 0 \\
1 & \quad \text{otherwise}
\end{cases}
$$
```{code-cell} ipython3
from pystencils.sympyextensions.integer_functions import int_rem
y = ps.DEFAULTS.spatial_counters[0]
start = sp.Piecewise(
(0, sp.Eq(int_rem(y, 2), 0)),
(1, True)
)
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[:, start::2]
)
```
:::{warning}
The restrictions on dimension ordering of the triangular iteration example apply
to the checkerboard-iteration as well.
:::
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
(section_memory_layout)=
## Memory Layout and Dimension Ordering
:::{admonition} Developer To Do
Briefly explain about field memory layouts, cache locality, coalesced memory accesses (on GPU and vector CPUs),
and the need for correct ordering of the dimensions (loop order on CPU, thread indexing order on GPU).
:::
(section_codegen_stages)=
## Advanced: Understanding the Stages of the Code Generator
While translating a set of symbolic definitions to a kernel, the code generator of pystencils
goes through a number of stages, gradually extending and transforming the AST.
Pystencils allows you to retrieve and inspect the intermediate results produced by the
code generator, in order to better understand the process of kernel translation.
This can be immensely helpful when tracking down bugs or trying to explain unexpected
output code.
To get access to the intermediate results, the code generator has to be invoked in a slightly different way.
Instead of just calling `create_kernel`, we directly create the so-called *driver* and instruct it to
store its intermediate ASTs:
```{code-cell} ipython3
:tags: [remove-cell]
u_src, u_dst, f = ps.fields("u_src, u_dst, f: float32[2D]")
h = sp.Symbol("h")
cfg = ps.CreateKernelConfig(
target=ps.Target.X86_AVX512,
default_dtype="float32",
)
cfg.cpu.openmp.enable = True
cfg.cpu.vectorize.enable = True
cfg.cpu.vectorize.assume_inner_stride_one = True
assignments = [
ps.Assignment(
u_dst[0,0], (h**2 * f[0, 0] + u_src[1, 0] + u_src[-1, 0] + u_src[0, 1] + u_src[0, -1]) / 4
)
]
```
```{code-cell} ipython3
driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
kernel = driver(assignments)
ps.inspect(driver.intermediates)
```
.. _page_symbolic_language:
*****************
Symbolic Language
*****************
Pystencils allows you to define near-arbitrarily complex numerical kernels in its symbolic
language, which is based on the computer algebra system `SymPy <https://www.sympy.org>`_.
The pystencils code generator is able to parse and translate a large portion of SymPy's
symbolic expression toolkit, and furthermore extends it with its own features.
Among the supported SymPy features are: symbols, constants, arithmetic and logical expressions,
trigonometric and most transcendental functions, as well as piecewise definitions.
Symbols
=======
Mathematical variables are generally represented using `sympy.Symbol <sympy.core.symbol.Symbol>`.
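
For example:

.. code-block:: Python

    import sympy as sp

    x, y = sp.symbols("x, y")
    expr = x**2 + sp.sin(y)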
Fields
======
The most important extension to SymPy brought by pystencils is the *field*.
Fields are a symbolic representation of multidimensional Cartesian numerical arrays,
as used in many stencil algorithms.
They are represented by the `Field` class.
Fields can be created from a textual description using the `fields <pystencils.field.fields>` function,
or, more concisely, using the factory methods `Field.create_generic` and `Field.create_fixed_size`.
It is also possible to create a field representing an existing numpy array,
including its shape, data type, and memory layout, using `Field.create_from_numpy_array`.
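
A few creation examples (a brief sketch of the factory functions named above):

.. code-block:: Python

    import numpy as np
    import pystencils as ps

    # From a textual description
    u, v = ps.fields("u, v: float64[2D]")

    # From an existing numpy array, adopting its shape, dtype and memory layout
    arr = np.zeros((16, 16))
    w = ps.Field.create_from_numpy_array("w", arr)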
.. autosummary::
:nosignatures:
pystencils.Field
Assignments and Assignment Collections
======================================
Pystencils relies heavily on SymPy's `Assignment <sympy.codegen.ast.Assignment>` class.
Assignments are the fundamental components of pystencils kernels;
they are used both for assigning expressions to symbols
and for writing values to fields.
Assignments are combined and structured inside `assignment collections <pystencils.AssignmentCollection>`.
An assignment collection contains two separate lists of assignments:
- The **subexpressions** list contains assignments to symbols which can be reused in all subsequent assignments.
These are typically used to structure computations into parts
  by precomputing (common) subexpressions.
- The **main assignments** represent the actual effect of the kernel by storing the computation's results
into fields.
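
For illustration, a small collection might be assembled like this
(a sketch; the keyword arguments mirror the two lists described above):

.. code-block:: Python

    import sympy as sp
    import pystencils as ps

    f, g = ps.fields("f, g: float64[2D]")
    tmp = sp.Symbol("tmp")

    ac = ps.AssignmentCollection(
        subexpressions=[ps.Assignment(tmp, f[0, 0] + f[1, 0])],
        main_assignments=[ps.Assignment(g[0, 0], 2 * tmp)],
    )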
.. autosummary::
:nosignatures:
pystencils.Assignment
pystencils.AssignmentCollection
Restrictions on SymPy Expressions
=================================
In order to produce valid C code, the *pystencils* code generator places some restrictions
on the SymPy expressions it consumes.
Piecewise Definitions
---------------------
Pystencils can parse and translate piecewise function definitions using
`sympy.Piecewise <sympy.functions.elementary.piecewise.Piecewise>`
*only if* they have a default case.
Any incomplete piecewise definition, such as the following, will result in an error from pystencils:
.. code-block:: Python
sp.Piecewise((0, x < 0), (1, sp.And(x >= 0, x < 1)))
To avoid this, you may explicitly mark the final branch as the default case by
setting its condition to ``True``:
.. code-block:: Python
sp.Piecewise((0, x < 0), (1, True))
This is not always necessary; if SymPy can prove that the range of possible values is covered completely,
it might simplify the final condition to ``True`` automatically.
Integer Operations
==================
Division and Remainder
----------------------
Care has to be taken when working with integer division operations in pystencils.
The Python operators ``//`` and ``%`` work differently from their counterparts in the C family of languages.
Where in C, integer division always rounds toward zero, ``//`` performs a floor division,
which rounds toward negative infinity.
These two operations differ whenever one of the operands is negative.
Accordingly, in Python ``a % b`` returns the *floored modulus*, which takes the sign of ``b``,
while C ``a % b`` computes the *remainder* of division.
For positive ``b``, the floored modulus is always nonnegative, while the remainder, if nonzero, always has the same sign as ``a``.
When ``//`` and ``%`` occur in symbolic expressions given to pystencils, they are interpreted the Python way.
This can lead to inefficient generated code, since Pythonic integer division does not map directly to the corresponding C
operators.
To achieve C behaviour (and efficient code), you can use
`pystencils.symb.int_div <pystencils.sympyextensions.integer_functions.int_div>` and
`pystencils.symb.int_rem <pystencils.sympyextensions.integer_functions.int_rem>`
which translate to C ``/`` and ``%``, respectively.
When expressions are translated in an integer type context, the Python ``/`` operator
will also be converted to C-style ``/`` integer division.
Still, use of ``/`` for integers is discouraged, as it is designed to return a floating-point value in Python.
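
The following sketch contrasts the two behaviours on a symbolic integer ``a``:

.. code-block:: Python

    import sympy as sp
    from pystencils.sympyextensions.integer_functions import int_div, int_rem

    a = sp.Symbol("a", integer=True)

    # Python semantics: for a = -7, a // 2 == -4 and a % 2 == 1
    python_style = a // 2 + a % 2

    # C semantics: for a = -7, int_div(a, 2) == -3 and int_rem(a, 2) == -1;
    # these translate directly to C's `/` and `%`
    c_style = int_div(a, 2) + int_rem(a, 2)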
[mypy]
python_version = 3.10
exclude = "src/pystencils/old"
[mypy-pystencils.*]
ignore_errors=true
[mypy-pystencils.backend.*]
ignore_errors = False
[mypy-pystencils.types.*]
ignore_errors = False
[mypy-pystencils.codegen.*]
ignore_errors = False
[mypy-pystencils.jit.*]
ignore_errors = False
[mypy-pystencils.field]
ignore_errors = False
[mypy-pystencils.sympyextensions.typed_sympy]
ignore_errors = False
[mypy-setuptools.*]
ignore_missing_imports=true
[mypy-appdirs.*]
ignore_missing_imports=true
[mypy-islpy.*]
ignore_missing_imports=true
[mypy-cupy.*]
ignore_missing_imports=true
[mypy-cpuinfo.*]
ignore_missing_imports=true
[mypy-fasteners.*]
ignore_missing_imports=true
from __future__ import annotations
from typing import Sequence
import os
import nox
import subprocess
import re
nox.options.sessions = ["lint", "typecheck", "testsuite"]
def get_cuda_version(session: nox.Session) -> None | tuple[int, ...]:
query_args = ["nvcc", "--version"]
try:
query_result = subprocess.run(query_args, capture_output=True)
except FileNotFoundError:
return None
matches = re.findall(r"release \d+\.\d+", str(query_result.stdout))
if matches:
match = matches[0]
version_string = match.split()[-1]
try:
return tuple(int(v) for v in version_string.split("."))
except ValueError:
pass
session.warn("nvcc was found, but I am unable to determine the CUDA version.")
return None
def install_cupy(
session: nox.Session, cupy_version: str, skip_if_no_cuda: bool = False
):
if cupy_version is not None:
cuda_version = get_cuda_version(session)
if cuda_version is None or cuda_version[0] not in (11, 12):
if skip_if_no_cuda:
session.skip(
"No compatible installation of CUDA found - Need either CUDA 11 or 12"
)
else:
session.warn(
"Running without cupy: no compatbile installation of CUDA found. Need either CUDA 11 or 12."
)
return
cuda_major = cuda_version[0]
cupy_package = f"cupy-cuda{cuda_major}x=={cupy_version}"
session.install(cupy_package)
def check_external_doc_dependencies(session: nox.Session):
dot_args = ["dot", "--version"]
try:
_ = subprocess.run(dot_args, capture_output=True)
except FileNotFoundError:
session.error(
"Unable to build documentation: "
"Command `dot` from the `graphviz` package (https://www.graphviz.org/) is not available"
)
def editable_install(session: nox.Session, opts: Sequence[str] = ()):
if opts:
opts_str = "[" + ",".join(opts) + "]"
else:
opts_str = ""
session.install("-e", f".{opts_str}")
@nox.session(python="3.10", tags=["qa", "code-quality"])
def lint(session: nox.Session):
"""Lint code using flake8"""
session.install("flake8")
session.run("flake8", "src/pystencils")
@nox.session(python="3.10", tags=["qa", "code-quality"])
def typecheck(session: nox.Session):
"""Run MyPy for static type checking"""
editable_install(session)
session.install("mypy")
session.run("mypy", "src/pystencils")
@nox.session(python=["3.10", "3.11", "3.12", "3.13"], tags=["test"])
@nox.parametrize("cupy_version", [None, "12", "13"], ids=["cpu", "cupy12", "cupy13"])
def testsuite(session: nox.Session, cupy_version: str | None):
"""Run the pystencils test suite.
**Positional Arguments:** Any positional arguments passed to nox after `--`
are propagated to pytest.
"""
if cupy_version is not None:
install_cupy(session, cupy_version, skip_if_no_cuda=True)
editable_install(session, ["alltrafos", "use_cython", "interactive", "testsuite"])
num_cores = os.cpu_count()
session.run(
"pytest",
"-v",
"-n",
str(num_cores),
"--cov-report=term",
"--cov=.",
"-m",
"not longrun",
"--html",
"test-report/index.html",
"--junitxml=report.xml",
*session.posargs
)
session.run("coverage", "html")
session.run("coverage", "xml")
@nox.session(python=["3.10"], tags=["docs"])
def docs(session: nox.Session):
"""Build the documentation pages"""
check_external_doc_dependencies(session)
install_cupy(session, "12.3")
editable_install(session, ["doc"])
env = {}
session_args = session.posargs
if "--fail-on-warnings" in session_args:
env["SPHINXOPTS"] = "-W --keep-going"
session.chdir("docs")
if "--clean" in session_args:
session.run("make", "clean", external=True)
session.run("make", "html", external=True, env=env)
@@ -12,7 +12,7 @@ authors = [
]
license = { file = "COPYING.txt" }
requires-python = ">=3.10"
dependencies = ["sympy>=1.6,<=1.11.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml"]
dependencies = ["sympy>=1.9,<=1.12.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml", "pybind11", "fasteners"]
classifiers = [
"Development Status :: 4 - Beta",
"Framework :: Jupyter",
@@ -44,34 +44,45 @@ interactive = [
use_cython = [
'Cython'
]
dev = [
"flake8",
"mypy",
"black",
]
doc = [
'sphinx',
'sphinx_rtd_theme',
'nbsphinx',
'pydata-sphinx-theme==0.15.4',
'sphinx-book-theme==1.1.3', # workaround for https://github.com/executablebooks/sphinx-book-theme/issues/865
'sphinxcontrib-bibtex',
'sphinx_autodoc_typehints',
'pandoc',
'sphinx_design',
'myst-nb',
'matplotlib',
'ipywidgets',
'graphviz',
]
tests = [
testsuite = [
'pytest',
'pytest-cov',
'pytest-html',
'ansi2html',
'pytest-xdist',
'flake8',
'mypy>=1.8',
'nbformat',
'nbconvert',
'ipython',
'matplotlib',
'py-cpuinfo',
'randomgen>=1.18',
'scipy'
]
[build-system]
requires = [
"setuptools>=61",
"versioneer>=0.29",
"tomli; python_version < '3.11'",
"versioneer[toml]>=0.29",
# 'Cython'
]
build-backend = "setuptools.build_meta"
@@ -79,6 +90,7 @@ build-backend = "setuptools.build_meta"
[tool.setuptools.package-data]
pystencils = [
"include/*.h",
"jit/cpu/*.tmpl.cpp",
"boundaries/createindexlistcython.pyx"
]
[pytest]
testpaths = src tests doc/notebooks
testpaths = src tests docs/source/tutorials
pythonpath = src
python_files = test_*.py *_test.py scenario_*.py
norecursedirs = *.egg-info .git .cache .ipynb_checkpoints htmlcov
addopts = --doctest-modules --durations=20 --cov-config pytest.ini
addopts =
--doctest-modules
--durations=20
--cov-config pytest.ini
--ignore=src/pystencils/old
--ignore=tests/_old
--ignore=tests/_todo
markers =
longrun: tests only run at night since they have large execution time
notebook: mark for notebooks
@@ -17,6 +23,7 @@ filterwarnings =
ignore:Using or importing the ABCs from 'collections' instead of from 'collections.abc':DeprecationWarning
ignore:Animation was deleted without rendering anything:UserWarning
# Coverage Configuration
[run]
branch = True
source = src/pystencils
@@ -25,6 +32,7 @@ source = src/pystencils
omit = doc/*
tests/*
setup.py
noxfile.py
quicktest.py
conftest.py
versioneer.py
@@ -34,6 +42,8 @@ omit = doc/*
src/pystencils/cache.py
src/pystencils/pacxx/benchmark.py
src/pystencils/_version.py
src/pystencils/_deprecation.py
src/pystencils/old
venv/
[report]
@@ -55,8 +65,11 @@ exclude_lines =
if False:
if __name__ == .__main__.:
# Don't cover type checking imports
if TYPE_CHECKING:
skip_covered = True
fail_under = 85
fail_under = 80
[html]
directory = coverage_report
"""Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
from .enums import Backend, Target
from .codegen import (
Target,
CreateKernelConfig,
AUTO
)
from .defaults import DEFAULTS
from . import fd
from . import stencil as stencil
from .assignment import Assignment, AddAugmentedAssignment, assignment_from_stencil
from .typing.typed_sympy import TypedSymbol
from .display_utils import get_code_obj, get_code_str, show_code, to_dot
from .inspection import inspect
from .field import Field, FieldType, fields
from .config import CreateKernelConfig
from .types import create_type, create_numeric_type
from .cache import clear_cache
from .kernel_decorator import kernel, kernel_config
from .kernelcreation import create_kernel, create_staggered_kernel
from .simp import AssignmentCollection
from .codegen.driver import create_kernel, create_staggered_kernel
from .codegen import Kernel
from .jit import no_jit
from .backend.exceptions import KernelConstraintsError
from .slicing import make_slice
from .spatial_coordinates import x_, x_staggered, x_staggered_vector, x_vector, y_, y_staggered, z_, z_staggered
from .sympyextensions import SymbolCreator
from .spatial_coordinates import (
x_,
x_staggered,
x_staggered_vector,
x_vector,
y_,
y_staggered,
z_,
z_staggered,
)
from .assignment import Assignment, AddAugmentedAssignment, assignment_from_stencil
from .simp import AssignmentCollection
from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
from .sympyextensions import SymbolCreator, tcast
from .datahandling import create_data_handling
__all__ = ['Field', 'FieldType', 'fields',
'TypedSymbol',
'make_slice',
'CreateKernelConfig',
'create_kernel', 'create_staggered_kernel',
'Target', 'Backend',
'show_code', 'to_dot', 'get_code_obj', 'get_code_str',
'AssignmentCollection',
'Assignment', 'AddAugmentedAssignment',
'assignment_from_stencil',
'SymbolCreator',
'create_data_handling',
'clear_cache',
'kernel', 'kernel_config',
'x_', 'y_', 'z_',
'x_staggered', 'y_staggered', 'z_staggered',
'x_vector', 'x_staggered_vector',
'fd',
'stencil']
from ._version import get_versions
__all__ = [
"Field",
"FieldType",
"fields",
"DEFAULTS",
"TypedSymbol",
"DynamicType",
"create_type",
"create_numeric_type",
"make_slice",
"CreateKernelConfig",
"AUTO",
"create_kernel",
"create_staggered_kernel",
"Kernel",
"KernelConstraintsError",
"Target",
"no_jit",
"show_code",
"to_dot",
"get_code_obj",
"get_code_str",
"inspect",
"AssignmentCollection",
"Assignment",
"AddAugmentedAssignment",
"assignment_from_stencil",
"SymbolCreator",
"create_data_handling",
"clear_cache",
"kernel",
"kernel_config",
"x_",
"y_",
"z_",
"x_staggered",
"y_staggered",
"z_staggered",
"x_vector",
"x_staggered_vector",
"fd",
"stencil",
"tcast",
]
__version__ = get_versions()['version']
del get_versions
from . import _version
__version__ = _version.get_versions()['version']
def _deprecated(feature, instead, version="2.1"):
from warnings import warn
warn(
f"{feature} is deprecated and will be removed in pystencils {version}."
f"Use {instead} instead.",
DeprecationWarning,
stacklevel=2
)
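# Usage sketch (hypothetical arguments): _deprecated("old_api", "new_api") warns:
# "old_api is deprecated and will be removed in pystencils 2.1. Use new_api instead."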
@@ -5,8 +5,9 @@
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain. Generated by
# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer)
# This file is released into the public domain.
# Generated by versioneer-0.29
# https://github.com/python-versioneer/python-versioneer
"""Git implementation of _version.py."""
@@ -15,9 +16,11 @@ import os
import re
import subprocess
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple
import functools
def get_keywords():
def get_keywords() -> Dict[str, str]:
"""Get the keywords needed to look up the version information."""
# these strings will be replaced by git during git-archive.
# setup.py/versioneer.py will grep for the variable names, so they must
@@ -33,8 +36,15 @@ def get_keywords():
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
VCS: str
style: str
tag_prefix: str
parentdir_prefix: str
versionfile_source: str
verbose: bool
def get_config():
def get_config() -> VersioneerConfig:
"""Create, populate and return the VersioneerConfig() object."""
# these strings are filled in when 'setup.py versioneer' creates
# _version.py
@@ -43,7 +53,7 @@ def get_config():
cfg.style = "pep440"
cfg.tag_prefix = "release/"
cfg.parentdir_prefix = "pystencils-"
cfg.versionfile_source = "pystencils/_version.py"
cfg.versionfile_source = "src/pystencils/_version.py"
cfg.verbose = False
return cfg
@@ -52,13 +62,13 @@ class NotThisMethod(Exception):
"""Exception raised if a method is not valid for the current scenario."""
LONG_VERSION_PY = {}
HANDLERS = {}
LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}
def register_vcs_handler(vcs, method): # decorator
def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
"""Create decorator to mark a method as the handler of a VCS."""
def decorate(f):
def decorate(f: Callable) -> Callable:
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
@@ -67,22 +77,35 @@ def register_vcs_handler(vcs, method):  # decorator
return decorate
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
env=None):
def run_command(
commands: List[str],
args: List[str],
cwd: Optional[str] = None,
verbose: bool = False,
hide_stderr: bool = False,
env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[int]]:
"""Call the given command(s)."""
assert isinstance(commands, list)
p = None
for c in commands:
process = None
popen_kwargs: Dict[str, Any] = {}
if sys.platform == "win32":
# This hides the console window if pythonw.exe is used
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
popen_kwargs["startupinfo"] = startupinfo
for command in commands:
try:
dispcmd = str([c] + args)
dispcmd = str([command] + args)
# remember shell=False, so use git.cmd on windows, not just git
p = subprocess.Popen([c] + args, cwd=cwd, env=env,
stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
else None))
process = subprocess.Popen([command] + args, cwd=cwd, env=env,
stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
else None), **popen_kwargs)
break
except EnvironmentError:
e = sys.exc_info()[1]
except OSError as e:
if e.errno == errno.ENOENT:
continue
if verbose:
@@ -93,16 +116,20 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
if verbose:
print("unable to find command, tried %s" % (commands,))
return None, None
stdout = p.communicate()[0].strip().decode()
if p.returncode != 0:
stdout = process.communicate()[0].strip().decode()
if process.returncode != 0:
if verbose:
print("unable to run %s (error)" % dispcmd)
print("stdout was %s" % stdout)
return None, p.returncode
return stdout, p.returncode
return None, process.returncode
return stdout, process.returncode
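# Behaviour sketch: run_command(["git"], ["rev-parse", "HEAD"]) returns a
# (stdout, returncode) pair such as ("1a2b3c...", 0); (None, None) signals that
# none of the candidate executables could be launched at all.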
def versions_from_parentdir(parentdir_prefix, root, verbose):
def versions_from_parentdir(
parentdir_prefix: str,
root: str,
verbose: bool,
) -> Dict[str, Any]:
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
@@ -111,15 +138,14 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
"""
rootdirs = []
for i in range(3):
for _ in range(3):
dirname = os.path.basename(root)
if dirname.startswith(parentdir_prefix):
return {"version": dirname[len(parentdir_prefix):],
"full-revisionid": None,
"dirty": False, "error": None, "date": None}
else:
rootdirs.append(root)
root = os.path.dirname(root) # up a level
rootdirs.append(root)
root = os.path.dirname(root) # up a level
if verbose:
print("Tried directories %s but none started with prefix %s" %
@@ -128,39 +154,42 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
keywords = {}
keywords: Dict[str, str] = {}
try:
f = open(versionfile_abs, "r")
for line in f.readlines():
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["refnames"] = mo.group(1)
if line.strip().startswith("git_full ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["full"] = mo.group(1)
if line.strip().startswith("git_date ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["date"] = mo.group(1)
f.close()
except EnvironmentError:
with open(versionfile_abs, "r") as fobj:
for line in fobj:
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["refnames"] = mo.group(1)
if line.strip().startswith("git_full ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["full"] = mo.group(1)
if line.strip().startswith("git_date ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["date"] = mo.group(1)
except OSError:
pass
return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
def git_versions_from_keywords(
keywords: Dict[str, str],
tag_prefix: str,
verbose: bool,
) -> Dict[str, Any]:
"""Get version information from git keywords."""
if not keywords:
raise NotThisMethod("no keywords at all, weird")
if "refnames" not in keywords:
raise NotThisMethod("Short version file found")
date = keywords.get("date")
if date is not None:
# Use only the last line. Previous lines may contain GPG signature
......@@ -179,11 +208,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
if verbose:
print("keywords are unexpanded, not using")
raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
refs = set([r.strip() for r in refnames.strip("()").split(",")])
refs = {r.strip() for r in refnames.strip("()").split(",")}
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
......@@ -192,7 +221,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
tags = set([r for r in refs if re.search(r'\d', r)])
tags = {r for r in refs if re.search(r'\d', r)}
if verbose:
print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
@@ -201,6 +230,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix):]
# Filter out refs that exactly match prefix or that don't start
# with a number once the prefix is stripped (mostly a concern
# when prefix is '')
if not re.match(r'\d', r):
continue
if verbose:
print("picking %s" % r)
return {"version": r,
@@ -216,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
def git_pieces_from_vcs(
tag_prefix: str,
root: str,
verbose: bool,
runner: Callable = run_command
) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
@@ -227,8 +266,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
hide_stderr=True)
# GIT_DIR can interfere with correct operation of Versioneer.
# It may be intended to be passed to the Versioneer-versioned project,
# but that should not change where we get our version from.
env = os.environ.copy()
env.pop("GIT_DIR", None)
runner = functools.partial(runner, env=env)
_, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
hide_stderr=not verbose)
if rc != 0:
if verbose:
print("Directory %s not under git control" % root)
@@ -236,24 +282,57 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
"--always", "--long",
"--match", "%s*" % tag_prefix],
cwd=root)
describe_out, rc = runner(GITS, [
"describe", "--tags", "--dirty", "--always", "--long",
"--match", f"{tag_prefix}[[:digit:]]*"
], cwd=root)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
describe_out = describe_out.strip()
full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
if full_out is None:
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
pieces = {}
pieces: Dict[str, Any] = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
cwd=root)
# --abbrev-ref was added in git-1.6.3
if rc != 0 or branch_name is None:
raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
branch_name = branch_name.strip()
if branch_name == "HEAD":
# If we aren't exactly on a branch, pick a branch which represents
# the current commit. If all else fails, we are on a branchless
# commit.
branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
# --contains was added in git-1.5.4
if rc != 0 or branches is None:
raise NotThisMethod("'git branch --contains' returned error")
branches = branches.split("\n")
# Remove the first line if we're running detached
if "(" in branches[0]:
branches.pop(0)
# Strip off the leading "* " from the list of branches.
branches = [branch[2:] for branch in branches]
if "master" in branches:
branch_name = "master"
elif not branches:
branch_name = None
else:
# Pick the first branch that is returned. Good or bad.
branch_name = branches[0]
pieces["branch"] = branch_name
# parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
# TAG might have hyphens.
git_describe = describe_out
@@ -270,7 +349,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
# TAG-NUM-gHEX
mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
if not mo:
# unparseable. Maybe git-describe is misbehaving?
# unparsable. Maybe git-describe is misbehaving?
pieces["error"] = ("unable to parse git-describe output: '%s'"
% describe_out)
return pieces
@@ -295,13 +374,11 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
else:
# HEX: no tags
pieces["closest-tag"] = None
count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
cwd=root)
pieces["distance"] = int(count_out) # total number of commits
out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
pieces["distance"] = len(out.split()) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
cwd=root)[0].strip()
date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
@@ -310,14 +387,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
return pieces
def plus_or_dot(pieces):
def plus_or_dot(pieces: Dict[str, Any]) -> str:
"""Return a + if we don't already have one, else return a ."""
if "+" in pieces.get("closest-tag", ""):
return "."
return "+"
def render_pep440(pieces):
def render_pep440(pieces: Dict[str, Any]) -> str:
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
@@ -342,23 +419,71 @@ def render_pep440(pieces):
return rendered
def render_pep440_pre(pieces):
"""TAG[.post0.devDISTANCE] -- No -dirty.
def render_pep440_branch(pieces: Dict[str, Any]) -> str:
"""TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
The ".dev0" means not master branch. Note that .dev0 sorts backwards
(a feature branch will appear "older" than the master branch).
Exceptions:
1: no tags. 0.post0.devDISTANCE
1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0"
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += "+untagged.%d.g%s" % (pieces["distance"],
pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
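# Worked example (values assumed): for pieces {"closest-tag": "1.3", "distance": 4,
# "short": "abc1234", "branch": "feature", "dirty": False} this renders
# "1.3.dev0+4.gabc1234".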
def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
"""Split pep440 version string at the post-release segment.
Returns the release segments before the post-release and the
post-release version number (or -1 if no post-release segment is present).
"""
vc = str.split(ver, ".post")
return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
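# Per the implementation above: pep440_split_post("1.2.post3") == ("1.2", 3),
# pep440_split_post("1.2") == ("1.2", None).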
def render_pep440_pre(pieces: Dict[str, Any]) -> str:
"""TAG[.postN.devDISTANCE] -- No -dirty.
Exceptions:
1: no tags. 0.post0.devDISTANCE
"""
if pieces["closest-tag"]:
if pieces["distance"]:
rendered += ".post0.dev%d" % pieces["distance"]
# update the post release segment
tag_version, post_version = pep440_split_post(pieces["closest-tag"])
rendered = tag_version
if post_version is not None:
rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
else:
rendered += ".post0.dev%d" % (pieces["distance"])
else:
# no commits, use the tag as the version
rendered = pieces["closest-tag"]
else:
# exception #1
rendered = "0.post0.dev%d" % pieces["distance"]
return rendered
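# Worked example (values assumed): closest-tag "1.2.post3" at distance 2 renders
# "1.2.post4.dev2"; closest-tag "1.2" at distance 2 renders "1.2.post0.dev2".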
def render_pep440_post(pieces):
def render_pep440_post(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
@@ -385,7 +510,36 @@ def render_pep440_post(pieces):
return rendered
def render_pep440_old(pieces):
def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
The ".dev0" means not master branch.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%d" % pieces["distance"]
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "g%s" % pieces["short"]
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0.post%d" % pieces["distance"]
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += "+g%s" % pieces["short"]
if pieces["dirty"]:
rendered += ".dirty"
return rendered
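# Worked example (values assumed): closest-tag "1.3", distance 2, branch "feature",
# short "abc1234" and a clean tree render "1.3.post2.dev0+gabc1234".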
def render_pep440_old(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
@@ -407,7 +561,7 @@ def render_pep440_old(pieces):
return rendered
def render_git_describe(pieces):
def render_git_describe(pieces: Dict[str, Any]) -> str:
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
@@ -427,7 +581,7 @@ def render_git_describe(pieces):
return rendered
def render_git_describe_long(pieces):
def render_git_describe_long(pieces: Dict[str, Any]) -> str:
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always -long'.
@@ -447,7 +601,7 @@ def render_git_describe_long(pieces):
return rendered
def render(pieces, style):
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
"""Render the given version pieces into the requested style."""
if pieces["error"]:
return {"version": "unknown",
@@ -461,10 +615,14 @@ def render(pieces, style):
if style == "pep440":
rendered = render_pep440(pieces)
elif style == "pep440-branch":
rendered = render_pep440_branch(pieces)
elif style == "pep440-pre":
rendered = render_pep440_pre(pieces)
elif style == "pep440-post":
rendered = render_pep440_post(pieces)
elif style == "pep440-post-branch":
rendered = render_pep440_post_branch(pieces)
elif style == "pep440-old":
rendered = render_pep440_old(pieces)
elif style == "git-describe":
@@ -479,7 +637,7 @@ def render(pieces, style):
"date": pieces.get("date")}
def get_versions():
def get_versions() -> Dict[str, Any]:
"""Get version information or return default if unable to do so."""
# I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
# __file__, we can work backwards from there to the root. Some
@@ -500,7 +658,7 @@ def get_versions():
# versionfile_source is the relative path from the top of the source
# tree (where the .git directory might live) to this file. Invert
# this to find the root from __file__.
for i in cfg.versionfile_source.split('/'):
for _ in cfg.versionfile_source.split('/'):
root = os.path.dirname(root)
except NameError:
return {"version": "0+unknown", "full-revisionid": None,
......
# flake8: noqa
import numpy as np
@@ -17,10 +18,12 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
        align_inner_coordinate: if True, the start of each innermost coordinate line is aligned as well
"""
if byte_alignment is True or byte_alignment == 'cacheline':
from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
get_vector_instruction_set)
# from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
# get_vector_instruction_set)
instruction_sets = get_supported_instruction_sets()
# instruction_sets = get_supported_instruction_sets()
# TODO `get_supported_instruction_sets` has to be reimplemented
instruction_sets = None
if instruction_sets is None:
byte_alignment = 64
elif byte_alignment == 'cacheline':
......
import numpy as np
import sympy as sp
from sympy.codegen.ast import Assignment, AugmentedAssignment, AddAugmentedAssignment
from sympy.codegen.ast import Assignment, AugmentedAssignment
from sympy.codegen.ast import AddAugmentedAssignment as SpAddAugAssignment
from sympy.printing.latex import LatexPrinter
__all__ = ['Assignment', 'AugmentedAssignment', 'AddAugmentedAssignment', 'assignment_from_stencil']
@@ -39,6 +40,9 @@ LatexPrinter._print_AugmentedAssignment = print_assignment_latex
sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self))
# Re-Export
AddAugmentedAssignment = SpAddAugAssignment
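# Sketch of the re-exported semantics: AddAugmentedAssignment(x, 2) represents the
# augmented statement `x += 2`, as defined by sympy.codegen.ast.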
def assignment_from_stencil(stencil_array, input_field, output_field,
normalization_factor=None, order='visual') -> Assignment:
......
import collections.abc
import itertools
import uuid
from typing import Any, List, Optional, Sequence, Set, Union
import sympy as sp
import pystencils
from pystencils.typing.utilities import create_type, get_next_parent_of_type
from pystencils.enums import Target, Backend
from pystencils.field import Field
from pystencils.typing.typed_sympy import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol
from pystencils.sympyextensions import fast_subs
NodeOrExpr = Union['Node', sp.Expr]
class Node:
"""Base class for all AST nodes."""
def __init__(self, parent: Optional['Node'] = None):
self.parent = parent
@property
def args(self) -> List[NodeOrExpr]:
"""Returns all arguments/children of this node."""
raise NotImplementedError()
@property
def symbols_defined(self) -> Set[sp.Symbol]:
"""Set of symbols which are defined by this node."""
raise NotImplementedError()
@property
def undefined_symbols(self) -> Set[sp.Symbol]:
"""Symbols which are used but are not defined inside this node."""
raise NotImplementedError()
def subs(self, subs_dict) -> None:
"""Inplace! Substitute, similar to sympy's but modifies the AST inplace."""
for i, a in enumerate(self.args):
result = a.subs(subs_dict)
if isinstance(a, sp.Expr): # sympy expressions' subs is out-of-place
self.args[i] = result
else: # all other should be in-place
assert result is None
@property
def func(self):
return self.__class__
def atoms(self, arg_type) -> Set[Any]:
"""Returns a set of all descendants recursively, which are an instance of the given type."""
result = set()
for arg in self.args:
if isinstance(arg, arg_type):
result.add(arg)
result.update(arg.atoms(arg_type))
return result
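    # Usage sketch (names from this module): for an AST node `ast`,
    # ast.atoms(LoopOverCoordinate) collects every loop node in the subtree and
    # ast.atoms(ResolvedFieldAccess) every resolved field access.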
class Conditional(Node):
"""Conditional that maps to a 'if' statement in C/C++.
Try to avoid using this node inside of loops, since currently this construction can not be vectorized.
Consider using assignments with sympy.Piecewise in this case.
Args:
condition_expr: sympy relational expression
true_block: block which is run if conditional is true
false_block: optional block which is run if conditional is false
"""
def __init__(self, condition_expr: sp.Basic, true_block: Union['Block', 'SympyAssignment'],
false_block: Optional['Block'] = None) -> None:
super(Conditional, self).__init__(parent=None)
self.condition_expr = condition_expr
def handle_child(c):
if c is None:
return None
if not isinstance(c, Block):
c = Block([c])
c.parent = self
return c
self.true_block = handle_child(true_block)
self.false_block = handle_child(false_block)
def subs(self, subs_dict):
self.true_block.subs(subs_dict)
if self.false_block:
self.false_block.subs(subs_dict)
self.condition_expr = self.condition_expr.subs(subs_dict)
@property
def args(self):
result = [self.condition_expr, self.true_block]
if self.false_block:
result.append(self.false_block)
return result
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
result = self.true_block.undefined_symbols
if self.false_block:
result.update(self.false_block.undefined_symbols)
if hasattr(self.condition_expr, 'atoms'):
result.update(self.condition_expr.atoms(sp.Symbol))
return result
def __str__(self):
return self.__repr__()
def __repr__(self):
result = f'if:({self.condition_expr!r}) '
if self.true_block:
result += f'\n\t{self.true_block}) '
if self.false_block:
            result += 'else: '
result += f'\n\t{self.false_block} '
return result
def replace_by_true_block(self):
"""Replaces the conditional by its True block"""
self.parent.replace(self, [self.true_block])
def replace_by_false_block(self):
"""Replaces the conditional by its False block"""
self.parent.replace(self, [self.false_block] if self.false_block else [])
class KernelFunction(Node):
class Parameter:
"""Function parameter.
Each undefined symbol in a `KernelFunction` node becomes a parameter to the function.
Parameters are either symbols introduced by the user that never occur on the left hand side of an
Assignment, or are related to fields/arrays passed to the function.
A parameter consists of the typed symbol (symbol property). For field related parameters this is a symbol
defined in pystencils.kernelparameters.
If the parameter is related to one or multiple fields, these fields are referenced in the fields property.
"""
def __init__(self, symbol, fields):
self.symbol = symbol # type: TypedSymbol
self.fields = fields # type: Sequence[Field]
def __repr__(self):
return repr(self.symbol)
@property
def is_field_stride(self):
return isinstance(self.symbol, FieldStrideSymbol)
@property
def is_field_shape(self):
return isinstance(self.symbol, FieldShapeSymbol)
@property
def is_field_pointer(self):
return isinstance(self.symbol, FieldPointerSymbol)
@property
def is_field_parameter(self):
return self.is_field_pointer or self.is_field_shape or self.is_field_stride
@property
def field_name(self):
return self.fields[0].name
def __init__(self, body, target: Target, backend: Backend, compile_function, ghost_layers,
function_name: str = "kernel",
assignments=None):
super(KernelFunction, self).__init__()
self._body = body
body.parent = self
self.function_name = function_name
self._body.parent = self
self.ghost_layers = ghost_layers
self._target = target
self._backend = backend
# these variables are assumed to be global, so no automatic parameter is generated for them
self.global_variables = set()
        self.instruction_set = None  # used in the `vectorize` function to tell the backend which instruction set (SSE, AVX) to use
# function that compiles the node to a Python callable, is set by the backends
self._compile_function = compile_function
self.assignments = assignments
        # If nontemporal stores are activated together with the Neon instruction set, cacheline zeroing is used.
        # Cacheline zeroing requires the size of each field, so in this case all field sizes become
        # kernel parameters instead of just the common field size used for the loops.
self.use_all_written_field_sizes = False
@property
def target(self):
"""See pystencils.Target"""
return self._target
@property
def backend(self):
"""Backend for generating the code: `Backend`"""
return self._backend
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
@property
def body(self):
return self._body
@body.setter
def body(self, value):
self._body = value
self._body.parent = self
@property
def args(self):
return self._body,
@property
def fields_accessed(self) -> Set[Field]:
"""Set of Field instances: fields which are accessed inside this kernel function"""
return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess)))
@property
def fields_written(self) -> Set[Field]:
assignments = self.atoms(SympyAssignment)
return set().union(itertools.chain.from_iterable([f.field for f in a.lhs.free_symbols if hasattr(f, 'field')]
for a in assignments))
@property
def fields_read(self) -> Set[Field]:
assignments = self.atoms(SympyAssignment)
return set().union(itertools.chain.from_iterable([f.field for f in a.rhs.free_symbols if hasattr(f, 'field')]
for a in assignments))
def get_parameters(self) -> Sequence['KernelFunction.Parameter']:
"""Returns list of parameters for this function.
        This function is expensive; cache the result where possible!
"""
field_map = {f.name: f for f in self.fields_accessed}
sizes = set()
if self.use_all_written_field_sizes:
sizes = set().union(*(a.shape[:a.spatial_dimensions] for a in self.fields_written))
sizes = filter(lambda s: isinstance(s, FieldShapeSymbol), sizes)
def get_fields(symbol):
if hasattr(symbol, 'field_name'):
return field_map[symbol.field_name],
elif hasattr(symbol, 'field_names'):
return tuple(field_map[fn] for fn in symbol.field_names)
return ()
argument_symbols = self._body.undefined_symbols - self.global_variables
argument_symbols.update(sizes)
parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
if hasattr(self, 'indexing'):
parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
parameters.sort(key=lambda p: p.symbol.name)
return parameters
def __str__(self):
params = [p.symbol for p in self.get_parameters()]
return '{0} {1}({2})\n{3}'.format(type(self).__name__, self.function_name, params,
("\t" + "\t".join(str(self.body).splitlines(True))))
def __repr__(self):
params = [p.symbol for p in self.get_parameters()]
return f'{type(self).__name__} {self.function_name}({params})'
def compile(self, *args, **kwargs):
if self._compile_function is None:
raise ValueError("No compile-function provided for this KernelFunction node")
return self._compile_function(self, *args, **kwargs)
class SkipIteration(Node):
@property
def args(self):
return []
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
class Block(Node):
def __init__(self, nodes: Union[Node, List[Node]]):
super(Block, self).__init__()
if not isinstance(nodes, list):
nodes = [nodes]
self._nodes = nodes
self.parent = None
for n in self._nodes:
try:
n.parent = self
except AttributeError:
pass
@property
def args(self):
return self._nodes
def subs(self, subs_dict) -> None:
for a in self.args:
a.subs(subs_dict)
def fast_subs(self, subs_dict, skip=None):
self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes]
return self
def insert_front(self, node, if_not_exists=False):
if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node:
return
if isinstance(node, collections.abc.Iterable):
node = list(node)
for n in node:
n.parent = self
self._nodes = node + self._nodes
else:
node.parent = self
self._nodes.insert(0, node)
def insert_before(self, new_node, insert_before, if_not_exists=False):
new_node.parent = self
assert self._nodes.count(insert_before) == 1
idx = self._nodes.index(insert_before)
if not if_not_exists or self._nodes[idx] != new_node:
self._nodes.insert(idx, new_node)
def insert_after(self, new_node, insert_after, if_not_exists=False):
new_node.parent = self
assert self._nodes.count(insert_after) == 1
idx = self._nodes.index(insert_after) + 1
if not if_not_exists or not (self._nodes[idx - 1] == new_node
or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
self._nodes.insert(idx, new_node)
def append(self, node):
if isinstance(node, list) or isinstance(node, tuple):
for n in node:
n.parent = self
self._nodes.append(n)
else:
node.parent = self
self._nodes.append(node)
def take_child_nodes(self):
tmp = self._nodes
self._nodes = []
return tmp
def replace(self, child, replacements):
assert self._nodes.count(child) == 1
idx = self._nodes.index(child)
del self._nodes[idx]
if type(replacements) is list:
for e in replacements:
e.parent = self
self._nodes = self._nodes[:idx] + replacements + self._nodes[idx:]
else:
replacements.parent = self
self._nodes.insert(idx, replacements)
@property
def symbols_defined(self):
result = set()
for a in self.args:
if isinstance(a, pystencils.Assignment):
result.update(a.free_symbols)
else:
result.update(a.symbols_defined)
return result
@property
def undefined_symbols(self):
result = set()
defined_symbols = set()
for a in self.args:
if isinstance(a, pystencils.Assignment):
result.update(a.free_symbols)
defined_symbols.update({a.lhs})
else:
result.update(a.undefined_symbols)
defined_symbols.update(a.symbols_defined)
return result - defined_symbols
def __str__(self):
return "Block " + ''.join('{!s}\n'.format(node) for node in self._nodes)
def __repr__(self):
return "Block"
class PragmaBlock(Block):
def __init__(self, pragma_line, nodes):
super(PragmaBlock, self).__init__(nodes)
self.pragma_line = pragma_line
for n in nodes:
n.parent = self
def __repr__(self):
return self.pragma_line
class LoopOverCoordinate(Node):
LOOP_COUNTER_NAME_PREFIX = "ctr"
BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False, custom_loop_ctr=None):
super(LoopOverCoordinate, self).__init__(parent=None)
self.body = body
body.parent = self
self.coordinate_to_loop_over = coordinate_to_loop_over
self.start = start
self.stop = stop
self.step = step
self.body.parent = self
self.prefix_lines = []
self.is_block_loop = is_block_loop
self.custom_loop_ctr = custom_loop_ctr
def new_loop_with_different_body(self, new_body):
result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
self.step, self.is_block_loop, self.custom_loop_ctr)
result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines]
return result
def subs(self, subs_dict):
self.body.subs(subs_dict)
if hasattr(self.start, "subs"):
self.start = self.start.subs(subs_dict)
if hasattr(self.stop, "subs"):
self.stop = self.stop.subs(subs_dict)
if hasattr(self.step, "subs"):
self.step = self.step.subs(subs_dict)
def fast_subs(self, subs_dict, skip=None):
self.body = fast_subs(self.body, subs_dict, skip)
if isinstance(self.start, sp.Basic):
self.start = fast_subs(self.start, subs_dict, skip)
if isinstance(self.stop, sp.Basic):
self.stop = fast_subs(self.stop, subs_dict, skip)
if isinstance(self.step, sp.Basic):
self.step = fast_subs(self.step, subs_dict, skip)
return self
@property
def args(self):
result = [self.body]
for e in [self.start, self.stop, self.step]:
if hasattr(e, "args"):
result.append(e)
return result
def replace(self, child, replacement):
if child == self.body:
self.body = replacement
elif child == self.start:
self.start = replacement
elif child == self.step:
self.step = replacement
elif child == self.stop:
self.stop = replacement
@property
def symbols_defined(self):
return {self.loop_counter_symbol}
@property
def undefined_symbols(self):
result = self.body.undefined_symbols
for possible_symbol in [self.start, self.stop, self.step]:
if isinstance(possible_symbol, Node) or isinstance(possible_symbol, sp.Basic):
result.update(possible_symbol.atoms(sp.Symbol))
return result - {self.loop_counter_symbol}
@staticmethod
def get_loop_counter_name(coordinate_to_loop_over):
return f"{LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX}_{coordinate_to_loop_over}"
@staticmethod
def get_block_loop_counter_name(coordinate_to_loop_over):
return f"{LoopOverCoordinate.BLOCK_LOOP_COUNTER_NAME_PREFIX}_{coordinate_to_loop_over}"
@property
def loop_counter_name(self):
if self.custom_loop_ctr:
return self.custom_loop_ctr.name
else:
if self.is_block_loop:
return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
else:
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
@staticmethod
def is_loop_counter_symbol(symbol):
prefix = LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX
if not symbol.name.startswith(prefix):
return None
if symbol.dtype != create_type('int'):
return None
coordinate = int(symbol.name[len(prefix) + 1:])
return coordinate
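    # Example per the logic above: an integer TypedSymbol named "ctr_1" yields
    # coordinate 1; symbols without the "ctr" prefix or with a non-int dtype yield None.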
@staticmethod
def get_loop_counter_symbol(coordinate_to_loop_over):
return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int', nonnegative=True)
@staticmethod
def get_block_loop_counter_symbol(coordinate_to_loop_over):
return TypedSymbol(LoopOverCoordinate.get_block_loop_counter_name(coordinate_to_loop_over),
'int',
nonnegative=True)
@property
def loop_counter_symbol(self):
if self.custom_loop_ctr:
return self.custom_loop_ctr
else:
if self.is_block_loop:
return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
else:
return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
@property
def is_outermost_loop(self):
return get_next_parent_of_type(self, LoopOverCoordinate) is None
@property
def is_innermost_loop(self):
return len(self.atoms(LoopOverCoordinate)) == 0
def __str__(self):
return 'for({!s}={!s}; {!s}<{!s}; {!s}+={!s})\n{!s}'.format(self.loop_counter_name, self.start,
self.loop_counter_name, self.stop,
self.loop_counter_name, self.step,
("\t" + "\t".join(str(self.body).splitlines(True))))
def __repr__(self):
return 'for({!s}={!s}; {!s}<{!s}; {!s}+={!s})'.format(self.loop_counter_name, self.start,
self.loop_counter_name, self.stop,
self.loop_counter_name, self.step)
class SympyAssignment(Node):
def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False):
super(SympyAssignment, self).__init__(parent=None)
self._lhs_symbol = sp.sympify(lhs_symbol)
self._rhs = sp.sympify(rhs_expr)
self._is_const = is_const
self._is_declaration = self.__is_declaration()
self._use_auto = use_auto
def __is_declaration(self):
from pystencils.typing import CastFunc
if isinstance(self._lhs_symbol, CastFunc):
return False
if any(isinstance(self._lhs_symbol, c) for c in (Field.Access, sp.Indexed, TemporaryMemoryAllocation)):
return False
return True
@property
def lhs(self):
return self._lhs_symbol
@property
def rhs(self):
return self._rhs
@lhs.setter
def lhs(self, new_value):
self._lhs_symbol = new_value
self._is_declaration = self.__is_declaration()
@rhs.setter
def rhs(self, new_rhs_expr):
self._rhs = new_rhs_expr
def subs(self, subs_dict):
self.lhs = fast_subs(self.lhs, subs_dict)
self.rhs = fast_subs(self.rhs, subs_dict)
def fast_subs(self, subs_dict, skip=None):
self.lhs = fast_subs(self.lhs, subs_dict, skip)
self.rhs = fast_subs(self.rhs, subs_dict, skip)
return self
def optimize(self, optimizations):
try:
from sympy.codegen.rewriting import optimize
self.rhs = optimize(self.rhs, optimizations)
except Exception:
pass
@property
def args(self):
return [self._lhs_symbol, self.rhs]
@property
def symbols_defined(self):
if not self._is_declaration:
return set()
return {self._lhs_symbol}
@property
def undefined_symbols(self):
result = {s for s in self.rhs.free_symbols if not isinstance(s, sp.Indexed)}
        # Add loop counters if there are field accesses
loop_counters = set()
for symbol in result:
if isinstance(symbol, Field.Access):
for i in range(len(symbol.offsets)):
loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i))
result.update(loop_counters)
result.update(self._lhs_symbol.atoms(sp.Symbol))
return result
@property
def is_declaration(self):
return self._is_declaration
@property
def is_const(self):
return self._is_const
@property
def use_auto(self):
return self._use_auto
def replace(self, child, replacement):
if child == self.lhs:
replacement.parent = self
self.lhs = replacement
elif child == self.rhs:
replacement.parent = self
self.rhs = replacement
else:
raise ValueError(f'{replacement} is not in args of {self.__class__}')
def __repr__(self):
return repr(self.lhs) + "" + repr(self.rhs)
def _repr_html_(self):
printed_lhs = sp.latex(self.lhs)
printed_rhs = sp.latex(self.rhs)
return f"${printed_lhs} \\leftarrow {printed_rhs}$"
def __hash__(self):
return hash((self.lhs, self.rhs))
def __eq__(self, other):
return type(self) is type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
class ResolvedFieldAccess(sp.Indexed):
def __new__(cls, base, linearized_index, field, offsets, idx_coordinate_values):
if not isinstance(base, sp.IndexedBase):
assert isinstance(base, TypedSymbol)
base = sp.IndexedBase(base, shape=(1,))
assert isinstance(base.label, TypedSymbol)
obj = super(ResolvedFieldAccess, cls).__new__(cls, base, linearized_index)
obj.field = field
obj.offsets = offsets
obj.idx_coordinate_values = idx_coordinate_values
return obj
def _eval_subs(self, old, new):
return ResolvedFieldAccess(self.args[0],
self.args[1].subs(old, new),
self.field, self.offsets, self.idx_coordinate_values)
def fast_subs(self, substitutions, skip=None):
if self in substitutions:
return substitutions[self]
return ResolvedFieldAccess(self.args[0].subs(substitutions),
self.args[1].subs(substitutions),
self.field, self.offsets, self.idx_coordinate_values)
def _hashable_content(self):
super_class_contents = super(ResolvedFieldAccess, self)._hashable_content()
return super_class_contents + tuple(self.offsets) + (repr(self.idx_coordinate_values), hash(self.field))
@property
def typed_symbol(self):
return self.base.label
def __str__(self):
top = super(ResolvedFieldAccess, self).__str__()
return f"{top} ({self.typed_symbol.dtype})"
def __getnewargs__(self):
return self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values
def __getnewargs_ex__(self):
return (self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values), {}
class TemporaryMemoryAllocation(Node):
"""Node for temporary memory buffer allocation.
Always allocates aligned memory.
Args:
typed_symbol: symbol used as pointer (has to be typed)
size: number of elements to allocate
align_offset: the align_offset's element is aligned
"""
def __init__(self, typed_symbol: TypedSymbol, size, align_offset):
super(TemporaryMemoryAllocation, self).__init__(parent=None)
self.symbol = typed_symbol
self.size = size
self.headers = ['<stdlib.h>']
self._align_offset = align_offset
@property
def symbols_defined(self):
return {self.symbol}
@property
def undefined_symbols(self):
if isinstance(self.size, sp.Basic):
return self.size.atoms(sp.Symbol)
else:
return set()
@property
def args(self):
return [self.symbol]
def offset(self, byte_alignment):
"""Number of ELEMENTS to skip for a pointer that is aligned to byte_alignment."""
np_dtype = self.symbol.dtype.base_type.numpy_dtype
assert byte_alignment % np_dtype.itemsize == 0
return -self._align_offset % (byte_alignment / np_dtype.itemsize)
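        # Worked example: for float64 elements (8 bytes), byte_alignment=32 and
        # align_offset=1 this returns -1 % (32 / 8) == 3.0, i.e. three elements are
        # skipped so that the element at index 1 lands on the 32-byte boundary.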
class TemporaryMemoryFree(Node):
def __init__(self, alloc_node):
super(TemporaryMemoryFree, self).__init__(parent=None)
self.alloc_node = alloc_node
@property
def symbol(self):
return self.alloc_node.symbol
def offset(self, byte_alignment):
return self.alloc_node.offset(byte_alignment)
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def early_out(condition):
from pystencils.cpu.vectorization import vec_all
return Conditional(vec_all(condition), Block([SkipIteration()]))
def get_dummy_symbol(dtype='bool'):
return TypedSymbol(f'dummy{uuid.uuid4().hex}', create_type(dtype))
class SourceCodeComment(Node):
def __init__(self, text):
self.text = text
@property
def args(self):
return []
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
def __str__(self):
return "/* " + self.text + " */"
def __repr__(self):
return self.__str__()
class EmptyLine(Node):
def __init__(self):
pass
@property
def args(self):
return []
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
def __str__(self):
return ""
def __repr__(self):
return self.__str__()
class ConditionalFieldAccess(sp.Function):
"""
:class:`pystencils.Field.Access` that is only executed if a certain condition is met.
    Can be used, for instance, for out-of-bounds checks.
"""
def __new__(cls, field_access, outofbounds_condition, outofbounds_value=0):
return sp.Function.__new__(cls, field_access, outofbounds_condition, sp.S(outofbounds_value))
@property
def access(self):
return self.args[0]
@property
def outofbounds_condition(self):
return self.args[1]
@property
def outofbounds_value(self):
return self.args[2]
def __getnewargs__(self):
return self.access, self.outofbounds_condition, self.outofbounds_value
def __getnewargs_ex__(self):
return (self.access, self.outofbounds_condition, self.outofbounds_value), {}
from .astnode import PsAstNode
from .iteration import dfs_preorder, dfs_postorder
__all__ = [
"PsAstNode",
"dfs_preorder",
"dfs_postorder",
]