Compare revisions
Commits on Source (6)
Showing with 199 additions and 142 deletions
......@@ -286,7 +286,7 @@ mypy-typecheck:
tests-and-coverage:
stage: "Unit Tests"
needs: []
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
before_script:
- pip install -e .[tests]
script:
......@@ -318,7 +318,7 @@ tests-and-coverage:
build-documentation:
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full:cupy12.3
stage: docs
needs: []
before_script:
......
......@@ -12,14 +12,17 @@ BUILDDIR = build
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
.PHONY: help html clean
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
# %: Makefile
html:
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
clean:
rm -rf source/reference/generated
rm -rf source/api/generated
rm -rf source/backend/generated
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
pystencils.codegen
==================
.. module:: pystencils.codegen
Invocation
----------
.. autosummary::
:toctree: generated
:nosignatures:
create_kernel
Configuration
-------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
CreateKernelConfig
CpuOptimConfig
OpenMpConfig
VectorizationConfig
GpuIndexingConfig
.. autosummary::
:toctree: generated
:nosignatures:
AUTO
Target Specification
--------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/recursive_class.rst
Target
Code Generation Drivers
-----------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
driver.DefaultKernelCreationDriver
.. autosummary::
:toctree: generated
:nosignatures:
get_driver
Output Code Objects
-------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
Kernel
GpuKernel
Parameter
GpuThreadsRange
Code Generator and Configuration
================================
pystencils.jit
==============
.. module:: pystencils.kernelcreation
.. module:: pystencils.jit
Base Infrastructure
-------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
KernelWrapper
JitBase
NoJit
create_kernel
.. autodata:: no_jit
.. module:: pystencils.config
Legacy CPU JIT
--------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
CreateKernelConfig
CpuOptimConfig
OpenMpConfig
VectorizationConfig
GpuIndexingConfig
LegacyCpuJit
CuPy-based GPU JIT
------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
AUTO
\ No newline at end of file
CupyJit
CupyKernelWrapper
LaunchGrid
......@@ -18,7 +18,6 @@ who wish to customize or extend the behaviour of the code generator in their app
platforms
transformations
errors
jit
extensions
Internal Representation
......
************************
Just-In-Time Compilation
************************
.. automodule:: pystencils.backend.jit
:members:
......@@ -76,7 +76,7 @@ The above alignment property, for instance, may be added to a pointer symbol by
to document its assumption that the pointer be properly aligned, in order to emit aligned load and store instructions.
It then becomes the responsibility of the runtime system embedding the kernel to check this prerequisite before calling the kernel.
To make sure this information becomes visible, any properties attached to symbols exposed as kernel parameters will also
be added to their respective `KernelParameter` instance.
be added to their respective `Parameter` instance.
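As a rough sketch of how a runtime system might act on this information, consider the snippet below.
It only relies on the interface visible in this changeset (``Kernel.parameters`` yielding ``Parameter``
objects with ``name`` and ``dtype``); the ``properties`` attribute queried via ``getattr`` is an
assumption and may be named differently in the actual API.

.. code-block:: python

    import pystencils as ps

    f, g = ps.fields("f, g: double[2D]")
    kernel = ps.create_kernel(ps.Assignment(g[0, 0], 2 * f[0, 0]))

    # Inspect the exposed kernel parameters and whatever properties
    # (e.g. field associations, alignment assumptions) are attached to them.
    for param in kernel.parameters:
        print(param.name, param.dtype, getattr(param, "properties", "<none>"))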
Buffers
-------
......@@ -110,7 +110,7 @@ The context makes sure to avoid name conflicts between buffers.
API Documentation
=================
.. automodule:: pystencils.backend.properties
.. automodule:: pystencils.codegen.properties
:members:
.. automodule:: pystencils.backend.memory
......
......@@ -69,28 +69,36 @@ Topics
------
.. toctree::
:maxdepth: 1
:caption: Getting Started
:maxdepth: 1
:caption: Getting Started
installation
tutorials/index
installation
tutorials/index
.. toctree::
:maxdepth: 1
:caption: Reference Guides
:maxdepth: 1
:caption: Reference Guides
reference/symbolic_language
reference/kernelcreation
reference/gpu_kernels
reference/types
reference/api/index
reference/symbolic_language
reference/kernelcreation
reference/gpu_kernels
reference/types
.. toctree::
:maxdepth: 1
:caption: Advanced
:maxdepth: 1
:caption: API
migration
backend/index
api/field
api/sympyextensions
api/codegen
api/jit
.. toctree::
:maxdepth: 1
:caption: Advanced
migration
backend/index
Projects using pystencils
-------------------------
......
***
API
***
Modules
=======
.. toctree::
:maxdepth: 1
field
sympyextensions
codegen
......@@ -49,9 +49,9 @@ ps.inspect(kernel)
```
The `kernel` object returned by the code generator in the above snippet is an instance
of the {py:class}`GpuKernelFunction` class.
It extends {py:class}`KernelFunction` with some GPU-specific information.
In particular, it defines the {any}`threads_range <GpuKernelFunction.threads_range>`
of the {py:class}`GpuKernel` class.
It extends {py:class}`Kernel` with some GPU-specific information.
In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
property, which tells us how many threads the kernel is expecting to be executed with:
```{code-cell} ipython3
......@@ -208,12 +208,10 @@ only a part of the triangle is being processed.
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/recursive_class.rst
pystencils.backend.kernelfunction.GpuKernelFunction
pystencils.backend.jit.gpu_cupy.CupyKernelWrapper
pystencils.codegen.GpuKernel
pystencils.jit.gpu_cupy.CupyKernelWrapper
```
:::{admonition} Developers To Do:
......@@ -226,4 +224,4 @@ only a part of the triangle is being processed.
[cupy]: https://cupy.dev "CuPy Homepage"
[numpy]: https://numpy.org "NumPy Homepage"
[nvcc]: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html "NVIDIA CUDA Compiler Driver"
[cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
\ No newline at end of file
[cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
......@@ -34,17 +34,19 @@ and their effects on the generated kernel.
## Running the Code Generator
The primary way to invoke the code generation engine is through the `create_kernel` function.
The primary way to invoke the code generation engine is through the {any}`create_kernel` function.
It takes two arguments:
- the list of assignments that make up the kernel (optionally wrapped as an ``AssignmentCollection``),
- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.config.CreateKernelConfig>`.
- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.codegen.config.CreateKernelConfig>`.
```{eval-rst}
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
pystencils.kernelcreation.create_kernel
pystencils.config.CreateKernelConfig
create_kernel
CreateKernelConfig
```
For a simple kernel, an invocation of the code generator might look like this:
......@@ -82,7 +84,7 @@ The above snippet defines a five-point-stencil Jacobi update. A few noteworthy t
## Inspecting the Generated Code
The object returned by the code generator, here named `kernel`, is an instance of the {any}`KernelFunction` class.
The object returned by the code generator, here named `kernel`, is an instance of the {any}`Kernel` class.
This object stores the kernel's name, its list of parameters, the set of fields it operates on, and its hardware target.
Also, it of course holds the kernel itself, in the form of an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST).
This tree can be printed out as compilable code in the target language (C++ or, in this case, CUDA),
......@@ -110,21 +112,14 @@ their interaction and effects, use cases and caveats.
Pystencils supports code generation for a variety of CPU and GPU hardware.
```{eval-rst}
.. currentmodule:: pystencils.config
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
CreateKernelConfig.target
.. module:: pystencils.target
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/recursive_class.rst
Target
```
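As a brief illustration (a sketch only; the field and update rule here are placeholders, and `Target.CUDA` is assumed to be available on the build system, as referenced elsewhere in this changeset), the target can be selected through the configuration object:

```python
import pystencils as ps

u, v = ps.fields("u, v: double[2D]")
update = ps.Assignment(u[0, 0], (v[1, 0] + v[-1, 0] + v[0, 1] + v[0, -1]) / 4)

# Request GPU code generation; if `target` is omitted, a CPU target is used.
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
kernel = ps.create_kernel(update, cfg)
```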
### Data Types
......@@ -176,7 +171,7 @@ are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKe
```{code-cell} ipython3
:tags: [remove-input]
driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
kernel = driver(assignments)
ps.inspect(driver.intermediates.materialized_ispace, show_cpp=False)
```
......@@ -186,7 +181,7 @@ To learn more about inspecting code after different stages of the code generator
:::
```{eval-rst}
.. currentmodule:: pystencils.config
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
......@@ -220,7 +215,7 @@ only one of which can be specified at a time:
:::
```{eval-rst}
.. currentmodule:: pystencils.config
.. currentmodule:: pystencils.codegen
.. autosummary::
:nosignatures:
......@@ -260,7 +255,7 @@ boundary values or exchange data in MPI-parallel simulations.
##### Automatic Ghost Layers
The easiest way to define an iteration space with ghost layers
is to set `ghost_layers=ps.config.AUTO`, which is also the default
is to set `ghost_layers=ps.AUTO`, which is also the default
when no iteration space options are specified.
In this case, the code generator will examine the kernel to find the maximum range
of its stencil -- that is, the maximum neighbor offset encountered in any field access.
......@@ -281,11 +276,11 @@ To illustrate, the following kernel accesses neighbor nodes with a maximum offse
```{code-cell} ipython3
ranged_update = ps.Assignment(u.center(), v[-2, -1] + v[2, 1])
cfg = ps.CreateKernelConfig(ghost_layers=ps.config.AUTO)
cfg = ps.CreateKernelConfig(ghost_layers=ps.AUTO)
kernel = ps.create_kernel(ranged_update, cfg)
```
With `ghost_layers=ps.config.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded).
With `ghost_layers=ps.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded).
```{code-cell} ipython3
:tags: [remove-input]
......@@ -506,22 +501,7 @@ assignments = [
```
```{code-cell} ipython3
driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
driver = ps.codegen.get_driver(cfg, retain_intermediates=True)
kernel = driver(assignments)
ps.inspect(driver.intermediates)
```
## API: Kernel Parameters and Function Objects
```{eval-rst}
.. module:: pystencils.backend.kernelfunction
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
KernelParameter
KernelFunction
GpuKernelFunction
```
%% Cell type:code id: tags:
``` python
import pystencils as ps
from pystencils import plot as plt
import numpy as np
import sympy as sp
```
%% Cell type:markdown id: tags:
# Tutorial 01: Getting Started
## Overview
*pystencils* is a package that can speed up computations on *numpy* arrays. All computations are carried out fully in parallel on CPUs (single node with OpenMP, multiple nodes with MPI) or on GPUs.
It is suited for applications that run the same operation on *numpy* arrays multiple times. It can be used to accelerate computations on images or voxel fields. Its main applications, however, are numerical simulations using finite differences, finite volumes, or lattice Boltzmann methods.
There already exists a variety of packages to speed up numeric Python code. One could use pure *numpy* or solutions that compile your code, like *Cython* and *numba*. See [this page](demo_benchmark.ipynb) for a comparison of these tools.
![Stencil](../_static/img/pystencils_stencil_four_points_with_arrows.svg)
As the name suggests, *pystencils* was developed for **stencil codes**, i.e. operations that update array elements using only a local neighborhood.
It generates C code, compiles it behind the scenes, and lets you call the compiled C function as if it were a native Python function.
But let's not dive too deep into the concepts of *pystencils* here; they are covered in detail in the following tutorials. Let's instead look at a simple example that computes the average of the neighbor values of a *numpy* array. Therefore, we first create two rather large arrays for input and output:
%% Cell type:code id: tags:
``` python
input_arr = np.random.rand(1024, 1024)
output_arr = np.zeros_like(input_arr)
```
%% Cell type:markdown id: tags:
We first implement a version of this algorithm using pure numpy and benchmark it.
%% Cell type:code id: tags:
``` python
def numpy_kernel():
output_arr[1:-1, 1:-1] = input_arr[2:, 1:-1] + input_arr[:-2, 1:-1] + \
input_arr[1:-1, 2:] + input_arr[1:-1, :-2]
```
%% Cell type:code id: tags:
``` python
%%timeit
numpy_kernel()
```
%% Output
4.74 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%% Cell type:markdown id: tags:
Now let's see how to run the same algorithm with *pystencils*.
%% Cell type:code id: tags:
``` python
src, dst = ps.fields(src=input_arr, dst=output_arr)
symbolic_description = ps.Assignment(dst[0,0],
(src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
symbolic_description
```
%% Output
$\displaystyle {dst}_{(0,0)} \leftarrow_{} \frac{{src}_{(1,0)}}{4} + \frac{{src}_{(0,1)}}{4} + \frac{{src}_{(0,-1)}}{4} + \frac{{src}_{(-1,0)}}{4}$
dst_C := src_E/4 + src_N/4 + src_S/4 + src_W/4
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(3,3))
ps.stencil.plot_expression(symbolic_description.rhs)
```
%% Output
%% Cell type:markdown id: tags:
Here, we have first created a symbolic notation of the stencil itself. This representation is built on top of *sympy* and is explained in detail in the next section.
This description is then compiled and loaded as a Python function.
%% Cell type:code id: tags:
``` python
kernel = ps.create_kernel(symbolic_description).compile()
```
%% Cell type:markdown id: tags:
This whole process might seem overly complicated. We have already spent more lines of code than we needed for the *numpy* implementation and don't have anything running yet! However, this multi-stage process of formulating the algorithm symbolically and only actually running it at the very end is what makes *pystencils* faster and more flexible than other approaches.
Now, finally, let's benchmark the *pystencils* kernel.
%% Cell type:code id: tags:
``` python
def pystencils_kernel():
kernel(src=input_arr, dst=output_arr)
```
%% Cell type:code id: tags:
``` python
%%timeit
pystencils_kernel()
```
%% Output
548 μs ± 34.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%% Cell type:markdown id: tags:
This benchmark shows that *pystencils* is a lot faster than pure *numpy*, especially for large arrays.
If you are interested in performance details and comparison to other packages like Cython, have a look at [this page](demo_benchmark.ipynb).
%% Cell type:markdown id: tags:
## Short *sympy* introduction
In this tutorial we continue with a short *sympy* introduction, since the symbolic kernel definition is built on top of this package. If you already know *sympy* you can skip this section.
You can also read the full [sympy documentation here](http://docs.sympy.org/latest/index.html).
%% Cell type:code id: tags:
``` python
import sympy as sp
sp.init_printing() # enable nice LaTeX output
```
%% Cell type:markdown id: tags:
*sympy* is a package for symbolic calculation. So first we need some symbols:
%% Cell type:code id: tags:
``` python
x = sp.Symbol("x")
y = sp.Symbol("y")
type(x)
```
%% Output
sympy.core.symbol.Symbol
%% Cell type:markdown id: tags:
The usual mathematical operations are defined for symbols:
%% Cell type:code id: tags:
``` python
expr = x**2 * ( y + x + 5) + x**2
expr
```
%% Output
$\displaystyle x^{2} \left(x + y + 5\right) + x^{2}$
x²⋅(x + y + 5) + x²
%% Cell type:markdown id: tags:
Now we can do all sorts of operations on these expressions: expand them, factor them, substitute variables:
%% Cell type:code id: tags:
``` python
expr.expand()
```
%% Output
$\displaystyle x^{3} + x^{2} y + 6 x^{2}$
x³ + x²⋅y + 6⋅x²
%% Cell type:code id: tags:
``` python
expr.factor()
```
%% Output
$\displaystyle x^{2} \left(x + y + 6\right)$
x²⋅(x + y + 6)
%% Cell type:code id: tags:
``` python
expr.subs(y, sp.cos(x))
```
%% Output
$\displaystyle x^{2} \left(x + \cos{\left(x \right)} + 5\right) + x^{2}$
x²⋅(x + cos(x) + 5) + x²
%% Cell type:markdown id: tags:
We can also build equations and solve them:
%% Cell type:code id: tags:
``` python
eq = sp.Eq(expr, 1)
eq
```
%% Output
$\displaystyle x^{2} \left(x + y + 5\right) + x^{2} = 1$
x²⋅(x + y + 5) + x² = 1
%% Cell type:code id: tags:
``` python
sp.solve(sp.Eq(expr, 1), y)
```
%% Output
$\displaystyle \left[ - x - 6 + \frac{1}{x^{2}}\right]$
[-x - 6 + 1/x²]
%% Cell type:markdown id: tags:
A *sympy* expression is represented by an abstract syntax tree (AST), which can be inspected and modified.
%% Cell type:code id: tags:
``` python
expr
```
%% Output
$\displaystyle x^{2} \left(x + y + 5\right) + x^{2}$
x²⋅(x + y + 5) + x²
%% Cell type:code id: tags:
``` python
ps.to_dot(expr, graph_style={'size': "9.5,12.5"} )
```
%% Output
<graphviz.sources.Source at 0x7e3154f58d30>
%% Cell type:markdown id: tags:
Programmatically, the node type is accessible as ``expr.func`` and its children as ``expr.args``.
With these members a tree can be traversed and modified.
%% Cell type:code id: tags:
``` python
expr.func
```
%% Output
sympy.core.add.Add
%% Cell type:code id: tags:
``` python
expr.args
```
%% Output
$\displaystyle \left( x^{2}, \ x^{2} \left(x + y + 5\right)\right)$
(x², x²⋅(x + y + 5))
%% Cell type:markdown id: tags:
## Using *pystencils*
### Fields
*pystencils* is a module to generate code for stencil operations.
One has to specify an update rule for each element of an array, with optional dependencies to neighbors.
This is done using pure *sympy* with one addition: **Fields**.
Fields represent a multidimensional array, where some dimensions are considered *spatial* and some are *index* dimensions. Spatial coordinates are given relative to the current cell (i.e. one can specify "the current cell" or "the left neighbor"), whereas index dimensions are used to index multiple values per cell.
%% Cell type:code id: tags:
``` python
my_field = ps.fields("f(3) : double[2D]")
```
%% Cell type:markdown id: tags:
Neighbors are labeled according to points on a compass, where the first coordinate is west/east, the second north/south, and the third top/bottom.
%% Cell type:code id: tags:
``` python
field_access = my_field[1, 0](1)
field_access
```
%% Output
$\displaystyle {f}_{(1,0)}^{1}$
f_E__1
%% Cell type:markdown id: tags:
The result of indexing a field is an instance of ``Field.Access``. This class is a subclass of a *sympy* Symbol and thus can be used wherever normal symbols can be used. It is just like a normal symbol with some additional information attached to it.
%% Cell type:code id: tags:
``` python
isinstance(field_access, sp.Symbol)
```
%% Output
True
%% Cell type:markdown id: tags:
### Building our first stencil kernel
Let's start by building a simple filter kernel. We create a field representing an image, then define an edge detection filter on the third pixel component, which is blue for an RGB image.
%% Cell type:code id: tags:
``` python
img_field = ps.fields("img(4): [2D]")
```
%% Cell type:code id: tags:
``` python
w1, w2 = sp.symbols("w_1 w_2")
color = 2
sobel_x = (-w2 * img_field[-1,0](color) - w1 * img_field[-1,-1](color) - w1 * img_field[-1, +1](color) \
+w2 * img_field[+1,0](color) + w1 * img_field[+1,-1](color) - w1 * img_field[+1, +1](color))**2
sobel_x
```
%% Output
$\displaystyle \left({img}_{(1,0)}^{2} w_{2} - {img}_{(1,1)}^{2} w_{1} - {img}_{(-1,1)}^{2} w_{1} + {img}_{(1,-1)}^{2} w_{1} - {img}_{(-1,-1)}^{2} w_{1} - {img}_{(-1,0)}^{2} w_{2}\right)^{2}$
(img_E__2⋅w₂ - img_NE__2⋅w₁ - img_NW__2⋅w₁ + img_SE__2⋅w₁ - img_SW__2⋅w₁ - img_W__2⋅w₂)²
%% Cell type:markdown id: tags:
We have mixed some standard *sympy* symbols into this expression to possibly give the different directions different weights. The complete expression is still a valid *sympy* expression, so all features of *sympy* work on it. Let's now, for example, fix one weight by substituting it with a constant.
%% Cell type:code id: tags:
``` python
sobel_x = sobel_x.subs(w1, 0.5)
sobel_x
```
%% Output
$\displaystyle \left({img}_{(1,0)}^{2} w_{2} - 0.5 {img}_{(1,1)}^{2} - 0.5 {img}_{(-1,1)}^{2} + 0.5 {img}_{(1,-1)}^{2} - 0.5 {img}_{(-1,-1)}^{2} - {img}_{(-1,0)}^{2} w_{2}\right)^{2}$
(img_E__2⋅w₂ - 0.5⋅img_NE__2 - 0.5⋅img_NW__2 + 0.5⋅img_SE__2 - 0.5⋅img_SW__2 - img_W__2⋅w₂)²
%% Cell type:markdown id: tags:
Now let's build an executable kernel out of it, which writes the result to a second field. Assignments are created using the *pystencils* `Assignment` class, which takes the left-hand and right-hand sides of the assignment.
%% Cell type:code id: tags:
``` python
dst_field = ps.fields('dst: [2D]' )
update_rule = ps.Assignment(dst_field[0,0], sobel_x)
update_rule
```
%% Output
$\displaystyle {dst}_{(0,0)} \leftarrow_{} \left({img}_{(1,0)}^{2} w_{2} - 0.5 {img}_{(1,1)}^{2} - 0.5 {img}_{(-1,1)}^{2} + 0.5 {img}_{(1,-1)}^{2} - 0.5 {img}_{(-1,-1)}^{2} - {img}_{(-1,0)}^{2} w_{2}\right)^{2}$
dst_C := (img_E__2⋅w₂ - 0.5⋅img_NE__2 - 0.5⋅img_NW__2 + 0.5⋅img_SE__2 - 0.5⋅img_SW__2 - img_W__2⋅w₂)²
%% Cell type:markdown id: tags:
Next we can see *pystencils* in action as it creates a kernel for us.
%% Cell type:code id: tags:
``` python
from pystencils import create_kernel
ast = create_kernel(update_rule, cpu_openmp=False)
compiled_kernel = ast.compile()
```
%% Output
/media/data/fhennig/research-hpc/projects/2024_pystencils_nbackend/pystencils/src/pystencils/config.py:327: FutureWarning: The `cpu_openmp` option of CreateKernelConfig is deprecated and will be removed in pystencils 2.1. Use `cpu_optim.openmp` instead.
warn(
%% Cell type:markdown id: tags:
This compiled kernel is now just an ordinary Python function.
Now let's grab an image to apply this filter to:
%% Cell type:code id: tags:
``` python
try:
import requests
import imageio.v2 as imageio
from io import BytesIO
response = requests.get("https://www.python.org/static/community_logos/python-logo-master-v3-TM.png")
img = imageio.imread(BytesIO(response.content)).astype(np.double)
img /= img.max()
plt.imshow(img);
except ImportError:
print("No requests or imageio installed")
img = np.random.random((82, 290, 4))
```
%% Output
No requests or imageio installed
%% Cell type:code id: tags:
``` python
filtered_image = np.zeros_like(img[..., 0])
# here we call the compiled stencil function
compiled_kernel(img=img, dst=filtered_image, w_2=0.5)
plt.imshow(filtered_image, cmap='gray');
```
%% Output
%% Cell type:markdown id: tags:
### Digging into *pystencils*
Along the way we have created an ``ast`` object. We can inspect it to see what *pystencils* actually does.
%% Cell type:code id: tags:
``` python
# TODO nbackend
# ps.to_dot(ast, graph_style={'size': "9.5,12.5"})
```
%% Cell type:markdown id: tags:
*pystencils* also builds a tree structure of the program, where each `Assignment` node internally again holds a *sympy* AST, which is not printed here. From this representation, *C* code can be generated:
%% Cell type:code id: tags:
``` python
ps.show_code(ast)
```
%% Output
%% Cell type:markdown id: tags:
Behind the scenes this code is compiled into a shared library and made available as a Python function. Before compiling this function we can modify the AST object, for example to parallelize it with OpenMP.
%% Cell type:code id: tags:
``` python
ast = ps.create_kernel(
update_rule,
cpu_optim = ps.config.CpuOptimConfig(
openmp=ps.config.OpenMpConfig(num_threads=2))
cpu_optim = ps.CpuOptimConfig(
openmp=ps.OpenMpConfig(num_threads=2))
)
ps.show_code(ast)
```
%% Output
%% Cell type:markdown id: tags:
### Fixed array sizes
Since we already know the arrays to which the kernel should be applied, we can
create *Field* objects with fixed size, based on a numpy array:
%% Cell type:code id: tags:
``` python
img_field, dst_field = ps.fields("I(4), dst : [2D]", I=img.astype(np.double), dst=filtered_image)
sobel_x = -2 * img_field[-1,0](1) - img_field[-1,-1](1) - img_field[-1, +1](1) \
+2 * img_field[+1,0](1) + img_field[+1,-1](1) - img_field[+1, +1](1)
update_rule = ps.Assignment(dst_field[0,0], sobel_x)
ast = create_kernel(update_rule)
ps.show_code(ast)
```
%% Output
%% Cell type:markdown id: tags:
Compare this code to the version above. In this code the loop bounds and array offsets are constants, which usually leads to faster kernels.
%% Cell type:markdown id: tags:
### Running on GPU
If you have a GPU and [cupy](https://cupy.dev/) installed, *pystencils* can run your kernel on the GPU as well. You can find more details about this in the GPU tutorial.
%% Cell type:code id: tags:
``` python
try:
import cupy
from pystencils.gpu import BlockIndexing
gpu_ast = create_kernel(update_rule, target=ps.Target.GPU,
gpu_indexing=BlockIndexing,
gpu_indexing_params={'blockSize': (64, 1, 1)})
ps.show_code(gpu_ast)
except ImportError:
print("Please install cupy for GPU support")
```
%% Output
Please install cupy for GPU support
......
......@@ -40,6 +40,7 @@ omit = doc/*
src/pystencils/cache.py
src/pystencils/pacxx/benchmark.py
src/pystencils/_version.py
src/pystencils/_deprecation.py
src/pystencils/old
venv/
......@@ -62,6 +63,9 @@ exclude_lines =
if False:
if __name__ == .__main__.:
# Don't cover type checking imports
if TYPE_CHECKING:
skip_covered = True
fail_under = 80
......
"""Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
from .target import Target
from .codegen import (
Target,
CreateKernelConfig,
CpuOptimConfig,
VectorizationConfig,
OpenMpConfig,
GpuIndexingConfig,
AUTO
)
from .defaults import DEFAULTS
from . import fd
from . import stencil as stencil
......@@ -9,17 +17,10 @@ from .inspection import inspect
from .field import Field, FieldType, fields
from .types import create_type, create_numeric_type
from .cache import clear_cache
from .config import (
CreateKernelConfig,
CpuOptimConfig,
VectorizationConfig,
OpenMpConfig,
GpuIndexingConfig,
)
from .kernel_decorator import kernel, kernel_config
from .kernelcreation import create_kernel, create_staggered_kernel
from .backend.kernelfunction import KernelFunction
from .backend.jit import no_jit
from .codegen.driver import create_kernel, create_staggered_kernel
from .codegen import Kernel
from .jit import no_jit
from .backend.exceptions import KernelConstraintsError
from .slicing import make_slice
from .spatial_coordinates import (
......@@ -53,9 +54,10 @@ __all__ = [
"VectorizationConfig",
"GpuIndexingConfig",
"OpenMpConfig",
"AUTO",
"create_kernel",
"create_staggered_kernel",
"KernelFunction",
"Kernel",
"KernelConstraintsError",
"Target",
"no_jit",
......
......@@ -5,4 +5,5 @@ def _deprecated(feature, instead, version="2.1"):
f"{feature} is deprecated and will be removed in pystencils {version}."
f"Use {instead} instead.",
DeprecationWarning,
stacklevel=2
)
from .kernelfunction import (
KernelParameter,
KernelFunction,
GpuKernelFunction,
)
from .constraints import KernelParamsConstraint
__all__ = [
"KernelParameter",
"KernelFunction",
"GpuKernelFunction",
"KernelParamsConstraint",
]
from __future__ import annotations
from typing import Any, TYPE_CHECKING
from dataclasses import dataclass
if TYPE_CHECKING:
from .kernelfunction import KernelParameter
@dataclass
class KernelParamsConstraint:
condition: Any # FIXME Implement conditions
message: str = ""
def to_code(self):
raise NotImplementedError()
def get_parameters(self) -> set[KernelParameter]:
raise NotImplementedError()
def __str__(self) -> str:
return f"{self.message} [{self.condition}]"
from __future__ import annotations
from enum import Enum
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
from ...target import Target
from ...codegen import Target
from ..ast.structural import (
PsAstNode,
......@@ -59,7 +60,8 @@ from ..memory import PsSymbol
from ..constants import PsConstant
from ...types import PsType
from ..kernelfunction import KernelFunction, GpuKernelFunction
if TYPE_CHECKING:
from ...codegen import Kernel
class EmissionError(Exception):
......@@ -172,8 +174,9 @@ class BasePrinter(ABC):
def __init__(self, indent_width=3):
self._indent_width = indent_width
def __call__(self, obj: PsAstNode | KernelFunction) -> str:
if isinstance(obj, KernelFunction):
def __call__(self, obj: PsAstNode | Kernel) -> str:
from ...codegen import Kernel
if isinstance(obj, Kernel):
sig = self.print_signature(obj)
body_code = self.visit(obj.body, PrinterCtx())
return f"{sig}\n{body_code}"
......@@ -372,7 +375,7 @@ class BasePrinter(ABC):
f"BasePrinter does not know how to print {type(node)}"
)
def print_signature(self, func: KernelFunction) -> str:
def print_signature(self, func: Kernel) -> str:
prefix = self._func_prefix(func)
params_str = ", ".join(
f"{self._type_str(p.dtype)} {p.name}" for p in func.parameters
......@@ -380,8 +383,10 @@ class BasePrinter(ABC):
signature = " ".join([prefix, "void", func.name, f"({params_str})"])
return signature
def _func_prefix(self, func: KernelFunction):
if isinstance(func, GpuKernelFunction) and func.target == Target.CUDA:
def _func_prefix(self, func: Kernel):
from ...codegen import GpuKernel
if isinstance(func, GpuKernel) and func.target == Target.CUDA:
return "__global__"
else:
return "FUNC_PREFIX"
......