Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (6), showing 1856 additions and 127 deletions
......@@ -4,4 +4,4 @@ exclude=src/pystencils/jupyter.py,
src/pystencils/plot.py
src/pystencils/session.py
src/pystencils/old
ignore = W293 W503 W291 C901 E741
ignore = W293 W503 W291 C901 E741 E704
......@@ -318,7 +318,7 @@ tests-and-coverage:
build-documentation:
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/documentation
image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
stage: docs
needs: []
before_script:
......@@ -328,6 +328,7 @@ build-documentation:
- make html SPHINXOPTS="-W --keep-going"
tags:
- docker
- cuda11
artifacts:
paths:
- docs/build/html
......
......@@ -203,3 +203,8 @@ else:
return IPyNbFile.from_parent(fspath=path, parent=parent)
else:
return IPyNbFile(path, parent)
# Fixtures
from tests.fixtures import *
build
# sphinx.ext.autosummary generated files
**/autoapi
**/generated
......@@ -18,3 +18,8 @@ help:
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
clean:
rm -rf source/reference/generated
rm -rf source/backend/generated
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
docs/source/_static/img/logo copy.png (9.88 KiB)
......@@ -20,7 +20,7 @@ Base Classes
.. module:: pystencils.backend.ast.astnode
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
......@@ -34,7 +34,7 @@ Structural Nodes
.. module:: pystencils.backend.ast.structural
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
......@@ -55,7 +55,7 @@ Expressions
.. module:: pystencils.backend.ast.expressions
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
......@@ -108,7 +108,7 @@ SIMD Nodes
.. module:: pystencils.backend.ast.vector
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
......@@ -123,7 +123,7 @@ Utility
.. currentmodule:: pystencils.backend.ast
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
expressions.evaluate_expression
......
......@@ -4,7 +4,6 @@ import re
from pystencils import __version__ as pystencils_version
project = "pystencils"
html_logo = "_static/img/logo.png"
html_title = "pystencils Documentation"
copyright = (
......@@ -33,9 +32,9 @@ extensions = [
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx.ext.inheritance_diagram",
"nbsphinx",
"sphinxcontrib.bibtex",
"sphinx_autodoc_typehints",
"myst_nb",
]
templates_path = ["_templates"]
......@@ -49,6 +48,7 @@ intersphinx_mapping = {
"numpy": ("https://docs.scipy.org/doc/numpy/", None),
"matplotlib": ("https://matplotlib.org/", None),
"sympy": ("https://docs.sympy.org/latest/", None),
"cupy": ("https://docs.cupy.dev/en/stable/", None),
}
# -- Options for inheritance diagrams-----------------------------------------
......@@ -57,18 +57,33 @@ inheritance_graph_attrs = {
"bgcolor": "white",
}
# -- Options for MyST / MyST-NB ----------------------------------------------
nb_execution_mode = "off" # do not execute notebooks by default
myst_enable_extensions = [
"dollarmath",
"colon_fence",
]
# -- Options for HTML output -------------------------------------------------
html_theme = "furo"
html_theme = "sphinx_book_theme"
html_static_path = ["_static"]
html_css_files = [
'css/fixtables.css',
"css/fixtables.css",
]
html_theme_options = {
"logo": {
"image_light": "_static/img/pystencils-logo-light.svg",
"image_dark": "_static/img/pystencils-logo-dark.svg",
}
}
# NbSphinx configuration
nbsphinx_execute = 'never'
nbsphinx_codecell_lexer = 'python3'
nbsphinx_execute = "never"
nbsphinx_codecell_lexer = "python3"
# BibTex
bibtex_bibfiles = ['pystencils.bib']
bibtex_bibfiles = ["pystencils.bib"]
......@@ -2,15 +2,6 @@
pystencils v2.0-dev Documentation
#################################
.. toctree::
:maxdepth: 1
:hidden:
tutorials/index
reference/index
migration
backend/index
.. note::
You are currently viewing the documentation pages for the development revision |release|
of pystencils 2.0.
......@@ -39,7 +30,7 @@ Its features include:
and take control of numerical precision using the `versatile type system <page_type_system>`.
- **Kernel Description:** Derive and optimize stencil-based update rules using a symbolic abstraction
of numerical `fields <page_symbolic_language>`.
- **Code Generation:** `Generate and compile <page_kernel_creation>` high-performance parallel kernels for CPUs and GPUs.
- **Code Generation:** `Generate and compile <guide_kernelcreation>` high-performance parallel kernels for CPUs and GPUs.
Accelerate your kernels on multicore CPUs using the automatic OpenMP parallelization
and make full use of your cores' SIMD units through the highly configurable vectorizer.
- **Rapid Prototyping:** Run your numerical solvers on `NumPy <https://numpy.org>`_ and `CuPy <https://cupy.dev>`_ arrays
......@@ -50,32 +41,56 @@ Its features include:
such as `waLBerla`_ to build massively parallel simulations.
Contents
--------
.. .. card:: Getting Started: Our Tutorials
.. :link: page_tutorials
.. :link-type: ref
.. card:: Getting Started: Our Tutorials
:link: page_tutorials
:link-type: ref
.. New to *pystencils*? Check out our set of tutorials to quickly and interactively learn the basics.
New to *pystencils*? Check out our set of tutorials to quickly and interactively learn the basics.
.. .. card:: Reference Guide and APIs
.. :link: page_api
.. :link-type: ref
.. card:: Reference Guide and APIs
:link: page_api
:link-type: ref
.. Get an overview of *pystencils*' APIs for mathematical modelling and code generation.
Get an overview of *pystencils*' APIs for mathematical modelling and code generation.
.. .. card:: Migration Guide: 1.3.x to 2.0
.. :link: page_v2_migration
.. :link-type: ref
.. card:: Migration Guide: 1.3.x to 2.0
:link: page_v2_migration
:link-type: ref
.. Find advice on migrating your code from *pystencils 1.3.x* to *pystencils 2.0*
Find advice on migrating your code from *pystencils 1.3.x* to *pystencils 2.0*
.. .. card:: Developer's Reference: Code Generation Backend
.. :link: page_codegen_backend
.. :link-type: ref
.. card:: Developer's Reference: Code Generation Backend
:link: page_codegen_backend
:link-type: ref
.. Dive deep into the core of pystencils' code generation engine.
Dive deep into the core of pystencils' code generation engine.
Topics
------
.. toctree::
:maxdepth: 1
:caption: Getting Started
installation
tutorials/index
.. toctree::
:maxdepth: 1
:caption: Reference Guides
reference/symbolic_language
reference/kernelcreation
reference/gpu_kernels
reference/types
reference/api/index
.. toctree::
:maxdepth: 1
:caption: Advanced
migration
backend/index
Projects using pystencils
-------------------------
......
(_installation)=
# Setup and Installation
## Install pystencils
There are two ways to install the latest development version of pystencils 2.0.
You can either install it directly from our git repository:
```bash
pip install "git+https://i10git.cs.fau.de/pycodegen/pystencils.git@v2.0-dev"
```
Or clone the repository locally and perform an editable install:
```bash
git clone -b v2.0-dev https://i10git.cs.fau.de/pycodegen/pystencils.git
pip install -e pystencils
```
### Feature Groups
In both cases, you can add a set of optional features to your installation by listing them
in square brackets (e.g. `pip install -e pystencils[feature1, feature2]`).
The following feature sets are available:
- `interactive` (**recommended**): Install dependencies for using pystencils interactively from
within Jupyter notebooks.
Setting this flag will cause pip to install `jupyter`, `matplotlib`, and `graphviz`, among others, alongside pystencils.
- `alltrafos` (**recommended**): Install dependencies to enable a wider set of code transformations.
These include [islpy](https://pypi.org/project/islpy/) for polyhedral loop transformations,
and [py-cpuinfo](https://pypi.org/project/py-cpuinfo/) for detecting the current hardware in order
to select optimal vector instructions.
- `use_cython`: Install [Cython](https://cython.org/), which is used internally by pystencils
to accelerate the setup of boundary conditions.
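For instance, the recommended feature groups can also be combined with the direct git install shown above, using PEP 508 direct-reference syntax (a sketch; the editable-install variant works just as well):

```shell
# Direct git install of the v2.0-dev branch with optional feature groups
pip install "pystencils[interactive,alltrafos] @ git+https://i10git.cs.fau.de/pycodegen/pystencils.git@v2.0-dev"
```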
:::{dropdown} For Developers
If you are developing pystencils, we recommend you perform an editable install of your
local clone of the repository, with all optional features:
```bash
pip install -e pystencils[alltrafos,interactive,use_cython,doc,tests]
```
This includes the additional feature groups `doc`, which contains all dependencies required
to build this documentation, and `tests`, which adds `flake8` for code style checking,
`mypy` for static type checking, and `pytest` plus plugins for running the test suite.
:::
### For Nvidia GPUs
If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile
and execute kernels running on your GPU.
This requires a working installation of [cupy](https://cupy.dev).
Please refer to CuPy's [installation manual](https://docs.cupy.dev/en/stable/install.html)
for details on installing CuPy.
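As a sketch, installing one of CuPy's prebuilt wheels typically looks like this; the exact package name depends on your installed CUDA toolkit version, so consult CuPy's installation manual for the wheel matching your setup:

```shell
# For CUDA 12.x installations (see CuPy's installation manual for other versions)
pip install cupy-cuda12x
```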
Code Generator and Configuration
================================
.. module:: pystencils.kernelcreation
.. autosummary::
:toctree: generated
:nosignatures:
create_kernel
.. module:: pystencils.config
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
CreateKernelConfig
CpuOptimConfig
OpenMpConfig
VectorizationConfig
GpuIndexingConfig
.. autosummary::
:toctree: generated
:nosignatures:
AUTO
\ No newline at end of file
......@@ -14,7 +14,7 @@ Creating Fields
---------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
fields
......@@ -30,7 +30,7 @@ Name and Element Type
^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: autoapi
:toctree: generated
Field.name
Field.dtype
......@@ -40,7 +40,7 @@ Dimensionality, Shape, and Memory Layout
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: autoapi
:toctree: generated
Field.ndim
Field.values_per_cell
......@@ -58,7 +58,7 @@ Accessing Field Entries
-----------------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
Field.center
......
***
API
***
Modules
=======
.. toctree::
:maxdepth: 1
field
sympyextensions
codegen
......@@ -7,7 +7,7 @@ Symbol Factory
--------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
SymbolCreator
......@@ -17,7 +17,7 @@ Functions
---------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
math.prod
......@@ -31,7 +31,7 @@ Expression Analysis
-------------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
math.is_constant
......@@ -46,7 +46,7 @@ Expression Rewriting and Simplifications
----------------------------------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
math.remove_small_floats
......@@ -78,7 +78,7 @@ Integer Operations
------------------
.. autosummary::
:toctree: autoapi
:toctree: generated
:nosignatures:
:template: autosummary/sympy_class.rst
......
---
jupytext:
formats: md:myst
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
mystnb:
execution_mode: cache
---
```{code-cell} ipython3
:tags: [remove-cell]
import sympy as sp
import pystencils as ps
import numpy as np
import matplotlib.pyplot as plt
```
(guide_gpukernels)=
# Pystencils for GPUs
Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
as well as just-in-time compilation and execution of CUDA kernels from within Python
based on the [cupy] library.
This section's objective is to give a detailed introduction to the creation of
GPU kernels with pystencils.
## Generate, Compile and Run CUDA Kernels
In order to obtain a CUDA implementation of a symbolic kernel, nothing more is required
than setting the {any}`target <CreateKernelConfig.target>` code generator option to
{any}`Target.CUDA`:
```{code-cell} ipython3
f, g = ps.fields("f, g: float64[3D]")
update = ps.Assignment(f.center(), 2 * g.center())
cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
kernel = ps.create_kernel(update, cfg)
ps.inspect(kernel)
```
The `kernel` object returned by the code generator in the above snippet is an instance
of the {py:class}`GpuKernelFunction` class.
It extends {py:class}`KernelFunction` with some GPU-specific information.
In particular, it defines the {any}`threads_range <GpuKernelFunction.threads_range>`
property, which tells us how many threads the kernel is expecting to be executed with:
```{code-cell} ipython3
kernel.threads_range
```
If a GPU is available and [CuPy][cupy] is installed in the current environment,
the kernel can be compiled and run immediately.
To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field.
:::{note}
[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
which operates much in the same way that [NumPy][numpy] works on CPU arrays.
CuPy and NumPy expose nearly the same APIs for array operations;
the difference being that CuPy allocates all its arrays on the GPU
and performs its operations as CUDA kernels.
Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvcc].
In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
and to allocate and manage the data these kernels can be executed on.
For more information on CuPy, refer to [their documentation][cupy-docs].
:::
```{code-cell} ipython3
:tags: [raises-exception]
import cupy as cp
rng = cp.random.default_rng(seed=42)
f_arr = rng.random((16, 16, 16))
g_arr = cp.zeros_like(f_arr)
kfunc = kernel.compile()
kfunc(f=f_arr, g=g_arr)
```
### Modifying the Launch Grid
The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
This object holds the kernel's launch grid configuration
(i.e. the number of thread blocks, and the number of threads per block).
Pystencils specifies a default value for the block size and, if possible,
the number of blocks is automatically inferred in order to cover the entire iteration space.
In addition, the wrapper's interface allows us to customize the GPU launch grid,
by manually setting both the number of threads per block, and the number of blocks on the grid:
```{code-cell} ipython3
kfunc.block_size = (16, 8, 8)
kfunc.num_blocks = (1, 2, 2)
```
For most kernels, setting only the `block_size` is sufficient since pystencils will
automatically compute the number of blocks;
for exceptions to this, see [](#manual_launch_grids).
If `num_blocks` is set manually and the resulting launch grid is too small, only
a part of the iteration space will be traversed by the kernel;
similarly, if it is too large, any threads working outside of the iteration bounds will idle.
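The arithmetic behind this can be sketched in plain Python (this is an illustration, not pystencils API), assuming the (16, 16, 16) iteration space from the example above: the total thread count per dimension is the product of block size and block count, and full coverage requires it to reach the iteration space's extent in every dimension.

```python
# Plain-Python sketch of launch-grid coverage arithmetic.
block_size = (16, 8, 8)   # threads per block, as set above
num_blocks = (1, 2, 2)    # blocks on the grid, as set above

# Total threads spawned per dimension:
total_threads = tuple(b * n for b, n in zip(block_size, num_blocks))

# The iteration space is covered iff every dimension is covered:
iteration_space = (16, 16, 16)
covers = all(t >= s for t, s in zip(total_threads, iteration_space))
```

Here `total_threads` comes out as `(16, 16, 16)`, exactly covering the example arrays.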
(manual_launch_grids)=
### Manual Launch Grids and Non-Cuboid Iteration Patterns
In some cases, it will be unavoidable to set the launch grid size manually,
especially if the code generator is unable to automatically determine the size of the
iteration space.
An example for this is the triangular iteration previously described in the [Kernel Creation Guide](#example_triangular_iteration).
Let's set it up once more:
```{code-cell} ipython3
:tags: [remove-cell]
def _draw_ispace(f_arr):
n, m = f_arr.shape
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, m, 4))
ax.set_yticks(np.arange(0, n, 4))
# ax.set_xticklabels([])
# ax.set_yticklabels([])
ax.set_xticks(np.arange(-.5, m, 1), minor=True)
ax.set_yticks(np.arange(-.5, n, 1), minor=True)
ax.grid(which="minor", linewidth=2)
ax.tick_params(which='minor', bottom=False, left=False)
ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
```
```{code-cell} ipython3
:tags: [remove-cell]
f = ps.fields("f: float64[2D]")
assignments = [
ps.Assignment(f(0), 1)
]
```
```{code-cell} ipython3
y = ps.DEFAULTS.spatial_counters[0]
cfg = ps.CreateKernelConfig(
target=ps.Target.CUDA,
iteration_slice=ps.make_slice[:, y:]
)
kernel = ps.create_kernel(assignments, cfg).compile()
```
This warns us that the threads range could not be determined automatically.
We can disable this warning by setting `manual_launch_grid` in the GPU indexing options:
```{code-cell}
cfg = ps.CreateKernelConfig(
# ... other options ...
gpu_indexing=ps.GpuIndexingConfig(
manual_launch_grid=True
)
)
```
Now, to execute our kernel, we have to manually specify its launch grid:
```{code-cell} ipython3
kernel.block_size = (8, 8)
kernel.num_blocks = (2, 2)
```
This way the kernel will cover this iteration space:
```{code-cell} ipython3
:tags: [remove-input]
f_arr = cp.zeros((16, 16))
kernel(f=f_arr)
_draw_ispace(cp.asnumpy(f_arr))
```
We can also observe the effect of decreasing the launch grid size:
```{code-cell} ipython3
kernel.block_size = (4, 4)
kernel.num_blocks = (2, 3)
```
```{code-cell} ipython3
:tags: [remove-input]
f_arr = cp.zeros((16, 16))
kernel(f=f_arr)
_draw_ispace(cp.asnumpy(f_arr))
```
Here, since there are only eight threads operating in $x$-direction,
and twelve threads in $y$-direction,
only a part of the triangle is being processed.
## API Reference
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/recursive_class.rst
pystencils.backend.kernelfunction.GpuKernelFunction
pystencils.backend.jit.gpu_cupy.CupyKernelWrapper
```
:::{admonition} Developers To Do:
- Fast approximation functions
- Fp16 on GPU
:::
[cupy]: https://cupy.dev "CuPy Homepage"
[numpy]: https://numpy.org "NumPy Homepage"
[nvcc]: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html "NVIDIA CUDA Compiler Driver"
[cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
\ No newline at end of file
.. _page_api:
###############
Reference Guide
###############
These pages list the public APIs of pystencils, with advice on how to use them.
.. toctree::
:maxdepth: 2
symbolic_language
kernelcreation
types
---
jupytext:
formats: md:myst
text_representation:
extension: .md
format_name: myst
format_version: 0.13
jupytext_version: 1.16.4
kernelspec:
display_name: Python 3 (ipykernel)
language: python
name: python3
mystnb:
execution_mode: cache
---
```{code-cell} ipython3
:tags: [remove-cell]
import sympy as sp
import pystencils as ps
import numpy as np
import matplotlib.pyplot as plt
```
(guide_kernelcreation)=
# Kernel Creation
Once a kernel's assignments are fully assembled, they need to be passed through pystencils' code
generation engine in order to produce the kernel's executable code.
The goal of this chapter is to shed light on pystencils' main code generation pipeline.
Here, we show how to invoke the code generator and discuss its various configuration options
and their effects on the generated kernel.
## Running the Code Generator
The primary way to invoke the code generation engine is through the `create_kernel` function.
It takes two arguments:
- the list of assignments that make up the kernel (optionally wrapped as an ``AssignmentCollection``),
- and a configuration object, an instance of {any}`CreateKernelConfig <pystencils.config.CreateKernelConfig>`.
```{eval-rst}
.. autosummary::
:nosignatures:
pystencils.kernelcreation.create_kernel
pystencils.config.CreateKernelConfig
```
For a simple kernel, an invocation of the code generator might look like this:
```{code-cell} ipython3
# Symbol and field definitions
u_src, u_dst, f = ps.fields("u_src, u_dst, f: float32[2D]")
h = sp.Symbol("h")
# Kernel definition
update = [
ps.Assignment(
u_dst[0,0], (h**2 * f[0, 0] + u_src[1, 0] + u_src[-1, 0] + u_src[0, 1] + u_src[0, -1]) / 4
)
]
# Code Generator Configuration
cfg = ps.CreateKernelConfig(
target=ps.Target.CUDA,
default_dtype="float32",
ghost_layers=1
)
kernel = ps.create_kernel(update, cfg)
```
The above snippet defines a five-point-stencil Jacobi update. A few noteworthy things are going on:
- The target data type of the kernel is to be `float32`.
This is explicitly specified for the three fields `u_src`, `u_dst` and `f`.
For the symbol `h`, this is left implicit; `h` therefore learns its data type from the `default_dtype` configuration option.
- The target hardware for this kernel are Nvidia GPUs; this is reflected by the `target` property being set to `Target.CUDA`.
- As the five-point stencil reads data from neighbors offset by one cell, it cannot legally be executed on the outermost
layer of nodes of the fields' 2D arrays. Here, we ensure that these outer layers are excluded by setting `ghost_layers=1`.
This is not strictly necessary, since the code generator could infer that information by itself.
## Inspecting the Generated Code
The object returned by the code generator, here named `kernel`, is an instance of the {any}`KernelFunction` class.
This object stores the kernel's name, its list of parameters, the set of fields it operates on, and its hardware target.
Also, it of course holds the kernel itself, in the form of an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST).
This tree can be printed out as compilable code in the target language (C++ or, in this case, CUDA),
but holds much more information than the printed-out code string.
When working in a Jupyter notebook, we can interactively inspect the kernel using pystencils' `inspect` function.
This reveals a widget that allows us to investigate various details about the kernel:
- its general properties, such as name, parameters, fields, target, ...;
- its code, represented in the *pystencils IR syntax*;
- the same code, in native C++/CUDA syntax;
- and a visual representation of its abstract syntax tree.
```{code-cell} ipython3
ps.inspect(kernel)
```
## Configuring the Code Generator
The code generation engine can be configured using a wide range of options.
This section aims at explaining the majority of these options,
their interaction and effects, use cases and caveats.
### Target Specification
Pystencils supports code generation for a variety of CPU and GPU hardware.
```{eval-rst}
.. currentmodule:: pystencils.config
.. autosummary::
:nosignatures:
CreateKernelConfig.target
.. module:: pystencils.target
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/recursive_class.rst
Target
```
### Data Types
To produce valid output code, the code generator has to figure out the data types of each
symbol, expression, and assignment occurring inside a kernel.
This happens roughly according to the following rules:
- **Field Accesses:** Each field has a fixed data type set at its creation, which is also applied to
each access to that field.
- **Symbols:** Symbols obtain their data types from two sources.
A symbol occurring first on the left-hand side of an assignment receives the data type that
was inferred for the right-hand side expression of that assignment.
Symbols occurring first inside some expression on the right-hand side of an assignment, on the other
hand, receive the {any}`default_dtype <CreateKernelConfig.default_dtype>` set in the {any}`CreateKernelConfig`.
We can observe this behavior by setting up a kernel including several fields with different data types:
```{code-cell} ipython3
from pystencils.sympyextensions import CastFunc
f = ps.fields("f: float32[2D]")
g = ps.fields("g: float16[2D]")
x, y, z = sp.symbols("x, y, z")
assignments = [
ps.Assignment(x, 42),
ps.Assignment(y, f(0) + x),
ps.Assignment(z, g(0))
]
cfg = ps.CreateKernelConfig(
default_dtype="float32",
index_dtype="int32"
)
kernel = ps.create_kernel(assignments, cfg)
```
We can take a look at the result produced by the code generator after parsing the above kernel.
Inspecting the internal representation of the kernel's body and loop nest,
we see that `x` has received the `float32` type,
which was specified via the {py:data}`default_dtype <CreateKernelConfig.default_dtype>` option.
The symbol `z`, on the other hand, has inherited its data type `float16` from the access to the field `g`
on its declaration's right-hand side.
Also, we can observe that the loop counters and symbols related to the field's memory layout
are using the `int32` data type, as specified in {py:data}`index_dtype <CreateKernelConfig.index_dtype>`:
```{code-cell} ipython3
:tags: [remove-input]
driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
kernel = driver(assignments)
ps.inspect(driver.intermediates.materialized_ispace, show_cpp=False)
```
:::{note}
To learn more about inspecting code after different stages of the code generator, refer to [this section](#section_codegen_stages).
:::
```{eval-rst}
.. currentmodule:: pystencils.config
.. autosummary::
:nosignatures:
CreateKernelConfig.default_dtype
CreateKernelConfig.index_dtype
```
### The Iteration Space
The *domain fields* a kernel operates on are understood to reside on a common,
one-, two- or three-dimensional computational grid.
The grid points may be understood as vertices or cells, depending on the application.
When executed, the kernel performs a computation and updates values on all, or a specific subset
of, these grid points.
The set of points the kernel actually operates on is defined by its *iteration space*.
There are three distinct options to control the iteration space in the code generator,
only one of which can be specified at a time:
- The ``ghost_layers`` option allows specifying a number of layers of grid points on all
domain borders that should be excluded from iteration;
- The ``iteration_slice`` option allows describing the iteration space using Pythonic slice objects;
- The ``index_field`` option can be used to realize a sparse list-based iteration by passing a special
*index field* which holds a list of all points that should be processed.
:::{note}
The points within a kernel's iteration space are understood to be processed concurrently and in
no particular order;
the output of any kernel that relies on some specific iteration order is therefore undefined.
(When running on a GPU, all grid points might in fact be processed in perfect simultaneity!)
:::
```{eval-rst}
.. currentmodule:: pystencils.config
.. autosummary::
:nosignatures:
CreateKernelConfig.ghost_layers
CreateKernelConfig.iteration_slice
CreateKernelConfig.index_field
```
```{code-cell} ipython3
:tags: [remove-cell]
def _draw_ispace(f_arr):
n, m = f_arr.shape
fig, ax = plt.subplots()
ax.set_xticks(np.arange(0, m, 4))
ax.set_yticks(np.arange(0, n, 4))
# ax.set_xticklabels([])
# ax.set_yticklabels([])
ax.set_xticks(np.arange(-.5, m, 1), minor=True)
ax.set_yticks(np.arange(-.5, n, 1), minor=True)
ax.grid(which="minor", linewidth=2)
ax.tick_params(which='minor', bottom=False, left=False)
ax.imshow(f_arr, interpolation="none", aspect="equal", origin="lower")
```
#### Specifying Ghost Layers
One way to alter the iteration space is by introducing ghost layers on the domain borders.
These layers of grid points are stripped from the iterations, and can be used to hold
boundary values or exchange data in MPI-parallel simulations.
##### Automatic Ghost Layers
The easiest way to define an iteration space with ghost layers
is to set `ghost_layers=ps.config.AUTO`, which is also the default
when no iteration space options are specified.
In this case, the code generator will examine the kernel to find the maximum range
of its stencil -- that is, the maximum neighbor offset encountered in any field access.
If, for instance, a neighbor node in $x$-direction with offset $k$ is accessed by the kernel,
it cannot legally execute on the outermost $k$ layers of nodes in that direction since it would
access memory out-of-bounds.
Therefore, $k$ ghost layers are automatically inferred at each domain border.
As we can see in the example below, the number of inferred ghost layers at each domain border will be set to the maximum required in any dimension.
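This inference rule can be illustrated with a small plain-Python sketch (hypothetical, not the code generator's actual implementation); the offsets correspond to the accesses `v[-2, -1]` and `v[2, 1]` used in the kernel below:

```python
# Hypothetical sketch of automatic ghost-layer inference:
# collect all neighbor offsets accessed by the kernel and take the
# maximum absolute offset component over all dimensions.
accessed_offsets = [(-2, -1), (2, 1)]
ghost_layers = max(abs(c) for offset in accessed_offsets for c in offset)
```

For these accesses the maximum offset magnitude, and hence the inferred ghost-layer count, is 2.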
```{code-cell} ipython3
:tags: [remove-cell]
u, v = ps.fields("u, v: [2D]")
```
To illustrate, the following kernel accesses neighbor nodes with a maximum offset of two:
```{code-cell} ipython3
ranged_update = ps.Assignment(u.center(), v[-2, -1] + v[2, 1])
cfg = ps.CreateKernelConfig(ghost_layers=ps.config.AUTO)
kernel = ps.create_kernel(ranged_update, cfg)
```
With `ghost_layers=ps.config.AUTO`, its iteration space will look like this (yellow cells are included, purple cells excluded):
```{code-cell} ipython3
:tags: [remove-input]
f = ps.fields("f: float64[2D]")
assignments = [
ranged_update,
ps.Assignment(f(0), 1)
]
kernel = ps.create_kernel(assignments).compile()
f_arr = np.zeros((16, 16))
u_arr = np.zeros_like(f_arr)
v_arr = np.zeros_like(f_arr)
kernel(f=f_arr, u=u_arr, v=v_arr)
_draw_ispace(f_arr)
```
##### Uniform and Nonuniform Ghost Layers
```{code-cell} ipython3
:tags: [remove-cell]
def _show_ispace(cfg):
f = ps.fields("f: float64[2D]")
assignments = [
ps.Assignment(f(0), 1)
]
kernel = ps.create_kernel(assignments, cfg).compile()
f_arr = np.zeros((16, 16))
kernel(f=f_arr)
_draw_ispace(f_arr)
```
Setting `ghost_layers` to a number will remove that many layers from the iteration space in each dimension:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
ghost_layers=1
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
Ghost layers can also be specified individually for each dimension and lower/upper borders,
by passing a sequence with either a single integer or a pair of integers per dimension:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
ghost_layers=[(2, 1), 3]
)
```
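A mixed specification like this presumably normalizes to one `(lower, upper)` pair per dimension; as a plain-Python sketch (hypothetical, not pystencils' internal code):

```python
# Hypothetical normalization of a mixed ghost-layer spec: a bare integer n
# stands for n layers at both the lower and upper border of that dimension.
spec = [(2, 1), 3]
normalized = [g if isinstance(g, tuple) else (g, g) for g in spec]
```

Under this reading, `[(2, 1), 3]` strips 2 lower and 1 upper layer in the first dimension, and 3 layers on both borders of the second.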
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
#### Iteration Slices
Using the `iteration_slice` option, we can exert much finer control over the kernel's iteration space
by specifying it using sequences of Python {py:class}`slice` objects.
We can quickly create those using `ps.make_slice`, using the `start:stop:step` slice notation.
The easiest case is to set the iteration space with fixed numerical limits:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[3:-4, 9:14]
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
##### Strided Iteration
It is also possible to set up a strided iteration that skips over a fixed number of elements.
The following example processes only every second line in $y$-direction, using the slice `::2`:
```{code-cell} ipython3
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[::2, 3:-3]
)
```
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
(example_triangular_iteration)=
##### Triangular Iteration
Iteration slices are not limited to constant numerical values; they can be arbitrarily complex
*SymPy* expressions.
By using the counter symbol for the first dimension to define the iteration limits of the second,
we can produce a triangular iteration pattern:
```{code-cell} ipython3
y = ps.DEFAULTS.spatial_counters[0]
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[:, y:]
)
```
:::{warning}
This type of dependency is restricted by the ordering of the iteration space dimensions:
The limits of a dimension can only depend on the counters of dimensions that are *slower*
than itself.
The ordering of dimensions is determined by the memory layout of the kernels' fields;
see also the [section on memory layouts](#section_memory_layout).
:::
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
##### Red-Black Iteration
Using a case distinction for the second dimension's start index, we can even produce
a checkerboard pattern, as required for e.g. red-black Gauss-Seidel-type smoothers.
We use the integer remainder ({any}`int_rem`) to distinguish between even- and odd-numbered rows,
set the start value accordingly using {any}`sp.Piecewise <sympy.functions.elementary.piecewise.Piecewise>`,
and use a step size of two:
$$
start(y)=
\begin{cases}
0 & \quad \text{if } y \; \mathrm{rem} \; 2 = 0 \\
1 & \quad \text{otherwise}
\end{cases}
$$
```{code-cell} ipython3
from pystencils.sympyextensions.integer_functions import int_rem
y = ps.DEFAULTS.spatial_counters[0]
start = sp.Piecewise(
(0, sp.Eq(int_rem(y, 2), 0)),
(1, True)
)
cfg = ps.CreateKernelConfig(
iteration_slice=ps.make_slice[:, start::2]
)
```
:::{warning}
The restrictions on dimension ordering of the triangular iteration example apply
to the checkerboard-iteration as well.
:::
```{code-cell} ipython3
:tags: [remove-input]
_show_ispace(cfg)
```
(section_memory_layout)=
## Memory Layout and Dimension Ordering
:::{admonition} Developer To Do
Briefly explain field memory layouts, cache locality, coalesced memory accesses (on GPU and vector CPUs),
and the need for correct ordering of the dimensions (loop order on CPU, thread indexing order on GPU).
:::
(section_codegen_stages)=
## Advanced: Understanding the Stages of the Code Generator
While translating a set of symbolic definitions to a kernel, the code generator of pystencils
goes through a number of stages, gradually extending and transforming the AST.
Pystencils allows you to retrieve and inspect the intermediate results produced by the
code generator, in order to better understand the process of kernel translation.
This can be immensely helpful when tracking down bugs or trying to explain unexpected
output code.
To get access to the intermediate results, the code generator has to be invoked in a slightly different way.
Instead of just calling `create_kernel`, we directly create the so-called *driver* and instruct it to
store its intermediate ASTs:
```{code-cell} ipython3
:tags: [remove-cell]
u_src, u_dst, f = ps.fields("u_src, u_dst, f: float32[2D]")
h = sp.Symbol("h")
cfg = ps.CreateKernelConfig(
target=ps.Target.X86_AVX512,
default_dtype="float32",
cpu_optim=ps.CpuOptimConfig(
openmp=True,
vectorize=ps.VectorizationConfig(
assume_inner_stride_one=True
)
)
)
assignments = [
ps.Assignment(
u_dst[0,0], (h**2 * f[0, 0] + u_src[1, 0] + u_src[-1, 0] + u_src[0, 1] + u_src[0, -1]) / 4
)
]
```
```{code-cell} ipython3
driver = ps.kernelcreation.get_driver(cfg, retain_intermediates=True)
kernel = driver(assignments)
ps.inspect(driver.intermediates)
```
## API: Kernel Parameters and Function Objects
```{eval-rst}
.. module:: pystencils.backend.kernelfunction
.. autosummary::
:toctree: generated
:nosignatures:
:template: autosummary/entire_class.rst
KernelParameter
KernelFunction
GpuKernelFunction
```
.. _page_kernel_creation:
***************
Kernel Creation
***************
Targets
=======
.. module:: pystencils.target
.. autosummary::
:toctree: autoapi
:nosignatures:
:template: autosummary/recursive_class.rst
Target
Configuration
=============
.. module:: pystencils.config
.. autosummary::
:toctree: autoapi
:nosignatures:
:template: autosummary/entire_class.rst
CreateKernelConfig
CpuOptimConfig
OpenMpConfig
VectorizationConfig
GpuIndexingConfig
Creation
========
.. module:: pystencils.kernelcreation
.. autosummary::
:toctree: autoapi
:nosignatures:
create_kernel
Kernel Parameters and Function Objects
======================================
.. module:: pystencils.backend.kernelfunction
.. autosummary::
:toctree: autoapi
:nosignatures:
:template: autosummary/entire_class.rst
KernelParameter
KernelFunction
GpuKernelFunction