Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing with 986 additions and 420 deletions
from typing import Any, Sequence, cast
from dataclasses import dataclass

try:
    import cupy as cp

    HAVE_CUPY = True
except ImportError:
    HAVE_CUPY = False

from ..codegen import Target
from ..field import FieldType
from ..types import PsType
from .jit import JitBase, JitError, KernelWrapper

from ..codegen import (
    Kernel,
    GpuKernel,
    Parameter,
)
from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
from ..types import PsStructType, PsPointerType

from ..include import get_pystencils_include_path


@dataclass
class LaunchGrid:
    grid: tuple[int, int, int]
    block: tuple[int, int, int]


class CupyKernelWrapper(KernelWrapper):
    def __init__(
        self,
        kfunc: GpuKernel,
        raw_kernel: Any,
        block_size: tuple[int, int, int],
    ):
        self._kfunc: GpuKernel = kfunc
        self._raw_kernel = raw_kernel
        self._block_size = block_size
        self._num_blocks: tuple[int, int, int] | None = None
        self._args_cache: dict[Any, tuple] = dict()

    @property
    def kernel_function(self) -> GpuKernel:
        return self._kfunc

    @property
    def raw_kernel(self):
        return self._raw_kernel

    @property
    def block_size(self) -> tuple[int, int, int]:
        return self._block_size

    @block_size.setter
    def block_size(self, bs: tuple[int, int, int]):
        self._block_size = bs

    @property
    def num_blocks(self) -> tuple[int, int, int] | None:
        return self._num_blocks

    @num_blocks.setter
    def num_blocks(self, nb: tuple[int, int, int] | None):
        self._num_blocks = nb

    def __call__(self, **kwargs: Any):
        kernel_args, launch_grid = self._get_cached_args(**kwargs)
        device = self._get_device(kernel_args)
        with cp.cuda.Device(device):
            self._raw_kernel(launch_grid.grid, launch_grid.block, kernel_args)

    def _get_device(self, kernel_args):
        devices = set(a.device.id for a in kernel_args if type(a) is cp.ndarray)
        if len(devices) != 1:
            raise JitError("Could not determine CUDA device to execute on")
        return devices.pop()

    def _get_cached_args(self, **kwargs):
        key = (self._block_size, self._num_blocks) + tuple(
            (k, id(v)) for k, v in kwargs.items()
        )

        if key not in self._args_cache:
            args = self._get_args(**kwargs)
            self._args_cache[key] = args
            return args
        else:
            return self._args_cache[key]

    def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
        args = []
        valuation: dict[str, Any] = dict()

        def add_arg(name: str, arg: Any, dtype: PsType):
            nptype = dtype.numpy_dtype
            assert nptype is not None
            typecast = nptype.type
            arg = typecast(arg)
            args.append(arg)
            valuation[name] = arg

        field_shapes = set()
        index_shapes = set()

        def check_shape(field_ptr: Parameter, arr: cp.ndarray):
            field = field_ptr.fields[0]

            if field.has_fixed_shape:
                expected_shape = tuple(int(s) for s in field.shape)
                if isinstance(field.dtype, PsStructType):
                    assert expected_shape[-1] == 1
                    expected_shape = expected_shape[:-1]

                actual_shape = arr.shape
                if expected_shape != actual_shape:
                    raise ValueError(
                        f"Array kernel argument {field.name} had unexpected shape:\n"
                        f"   Expected {expected_shape}, but got {actual_shape}"
                    )

                expected_strides = tuple(int(s) for s in field.strides)
                if isinstance(field.dtype, PsStructType):
                    assert expected_strides[-1] == 1
                    expected_strides = expected_strides[:-1]

                actual_strides = tuple(s // arr.dtype.itemsize for s in arr.strides)
                if expected_strides != actual_strides:
                    raise ValueError(
                        f"Array kernel argument {field.name} had unexpected strides:\n"
                        f"   Expected {expected_strides}, but got {actual_strides}"
                    )

            match field.field_type:
                case FieldType.GENERIC:
                    field_shapes.add(arr.shape[: field.spatial_dimensions])

                    if len(field_shapes) > 1:
                        raise ValueError(
                            "Incompatible array shapes: "
                            "All arrays passed for generic fields to a kernel must have the same shape."
                        )

                case FieldType.INDEXED:
                    index_shapes.add(arr.shape)

                    if len(index_shapes) > 1:
                        raise ValueError(
                            "Incompatible array shapes: "
                            "All arrays passed for index fields to a kernel must have the same shape."
                        )

        #   Collect parameter values
        arr: cp.ndarray

        for kparam in self._kfunc.parameters:
            if kparam.is_field_parameter:
                #   Determine field-associated data to pass in
                for prop in kparam.properties:
                    match prop:
                        case FieldBasePtr(field):
                            elem_dtype: PsType

                            from .. import DynamicType

                            if isinstance(field.dtype, DynamicType):
                                assert isinstance(kparam.dtype, PsPointerType)
                                elem_dtype = kparam.dtype.base_type
                            else:
                                elem_dtype = field.dtype

                            arr = kwargs[field.name]
                            if arr.dtype != elem_dtype.numpy_dtype:
                                raise JitError(
                                    f"Data type mismatch at array argument {field.name}: "
                                    f"Expected {field.dtype}, got {arr.dtype}"
                                )
                            check_shape(kparam, arr)
                            args.append(arr)
                            break

                        case FieldShape(field, coord):
                            arr = kwargs[field.name]
                            add_arg(kparam.name, arr.shape[coord], kparam.dtype)
                            break

                        case FieldStride(field, coord):
                            arr = kwargs[field.name]
                            add_arg(
                                kparam.name,
                                arr.strides[coord] // arr.dtype.itemsize,
                                kparam.dtype,
                            )
                            break
            else:
                #   Scalar parameter
                val: Any = kwargs[kparam.name]
                add_arg(kparam.name, val, kparam.dtype)

        #   Determine launch grid
        from ..backend.ast.expressions import evaluate_expression

        symbolic_threads_range = self._kfunc.threads_range

        if self._num_blocks is not None:
            launch_grid = LaunchGrid(self._num_blocks, self._block_size)

        elif symbolic_threads_range is not None:
            threads_range: list[int] = [
                evaluate_expression(expr, valuation)
                for expr in symbolic_threads_range.num_work_items
            ]

            if symbolic_threads_range.dim < 3:
                threads_range += [1] * (3 - symbolic_threads_range.dim)

            def div_ceil(a, b):
                return a // b if a % b == 0 else a // b + 1

            #   TODO: Refine this?
            num_blocks = tuple(
                div_ceil(threads, tpb)
                for threads, tpb in zip(threads_range, self._block_size)
            )
            assert len(num_blocks) == 3

            launch_grid = LaunchGrid(num_blocks, self._block_size)

        else:
            raise JitError(
                "Unable to determine launch grid for GPU kernel invocation: "
                "No manual grid size was specified, and the number of threads could not "
                "be determined automatically."
            )

        return tuple(args), launch_grid
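
A quick worked instance of the launch-grid arithmetic above (numbers are illustrative; the block size is the `CupyJit` default):

    def div_ceil(a: int, b: int) -> int:
        return a // b if a % b == 0 else a // b + 1

    threads = [64, 64, 1]   # illustrative iteration-space extent per axis
    block = (128, 2, 1)     # default CupyJit block size
    blocks = tuple(div_ceil(t, b) for t, b in zip(threads, block))
    assert blocks == (1, 32, 1)   # one block in x, 32 in y, 1 in z
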
class CupyJit(JitBase):

    def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
        self._runtime_headers = {"<cstdint>"}

        if len(default_block_size) > 3:
            raise ValueError(
                f"Invalid block size: {default_block_size}. Must be at most three-dimensional."
            )

        self._default_block_size: tuple[int, int, int] = cast(
            tuple[int, int, int],
            tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
        )

    def compile(self, kfunc: Kernel) -> KernelWrapper:
        if not HAVE_CUPY:
            raise JitError(
                "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
            )

        if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA:
            raise ValueError(
                "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
            )

        options = self._compiler_options()
        prelude = self._prelude(kfunc)
        kernel_code = self._kernel_code(kfunc)
        code = prelude + kernel_code

        raw_kernel = cp.RawKernel(
            code, kfunc.name, options=options, backend="nvrtc", jitify=True
        )
        return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)

    def _compiler_options(self) -> tuple[str, ...]:
        options = ["-w", "-std=c++11"]
        options.append("-I" + get_pystencils_include_path())
        return tuple(options)

    def _prelude(self, kfunc: GpuKernel) -> str:
        headers = self._runtime_headers
        headers |= kfunc.required_headers

        if '"half_precision.h"' in headers:
            headers.remove('"half_precision.h"')
            if cp.cuda.runtime.is_hip:
                headers.add("<hip/hip_fp16.h>")
            else:
                headers.add("<cuda_fp16.h>")

        code = "\n".join(f"#include {header}" for header in headers)
        code += "\n\n#define RESTRICT __restrict__\n\n"
        return code

    def _kernel_code(self, kfunc: GpuKernel) -> str:
        kernel_code = kfunc.get_c_code()
        return f'extern "C" {kernel_code}'
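
A usage sketch for `CupyJit` (hedged: the `ps.fields`, `ps.Assignment`, and `ps.create_kernel` calls are assumed from the public pystencils API; array sizes and block sizes are illustrative):

    import cupy as cp
    import pystencils as ps

    f, g = ps.fields("f, g: float64[2D]")
    update = ps.Assignment(g[0, 0], 2 * f[0, 0])
    kernel = ps.create_kernel(update, target=ps.Target.CUDA)  # assumed entry point

    jit = CupyJit(default_block_size=(64, 4, 1))
    kfunc = jit.compile(kernel)
    kfunc.block_size = (128, 2, 1)     # the launch configuration can be overridden later

    f_arr = cp.ones((32, 32))
    g_arr = cp.zeros_like(f_arr)
    kfunc(f=f_arr, g=g_arr)            # field names become keyword arguments
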
from __future__ import annotations

from typing import Sequence, TYPE_CHECKING
from abc import ABC, abstractmethod

if TYPE_CHECKING:
    from ..codegen import Kernel, Parameter, Target


class JitError(Exception):
    """Indicates an error during just-in-time compilation"""


class KernelWrapper(ABC):
    """Wrapper around a compiled and executable pystencils kernel."""

    def __init__(self, kfunc: Kernel) -> None:
        self._kfunc = kfunc

    @abstractmethod
    def __call__(self, **kwargs) -> None:
        pass

    @property
    def kernel_function(self) -> Kernel:
        return self._kfunc

    @property
    def ast(self) -> Kernel:
        return self._kfunc

    @property
    def target(self) -> Target:
        return self._kfunc.target

    @property
    def parameters(self) -> Sequence[Parameter]:
        return self._kfunc.parameters

    @property
    def code(self) -> str:
        from pystencils.display_utils import get_code_str

        return get_code_str(self._kfunc)


class JitBase(ABC):
    """Base class for just-in-time compilation interfaces implemented in pystencils."""

    @abstractmethod
    def compile(self, kernel: Kernel) -> KernelWrapper:
        """Compile a kernel function and return a callable object which invokes the kernel."""


class NoJit(JitBase):
    """Not a JIT compiler: Used to explicitly disable JIT compilation on an AST."""

    def compile(self, kernel: Kernel) -> KernelWrapper:
        raise JitError(
            "Just-in-time compilation of this kernel was explicitly disabled."
        )
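
To illustrate the contract, a minimal hypothetical backend that satisfies `JitBase` without doing any real compilation; `EchoJit` and `_EchoWrapper` are names invented for this sketch:

    class _EchoWrapper(KernelWrapper):
        # Prints the generated code instead of executing the kernel
        def __call__(self, **kwargs) -> None:
            print(self.code)   # `code` is inherited from KernelWrapper above


    class EchoJit(JitBase):
        # "Compiles" a kernel into a printing stub
        def compile(self, kernel: Kernel) -> KernelWrapper:
            return _EchoWrapper(kernel)
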
# mypy: ignore-errors
r"""
*pystencils* automatically searches for a compiler, so in most cases no explicit configuration is required.
On Linux make sure that 'gcc' and 'g++' are installed and in your path.
On Windows a recent Visual Studio installation is required.
In case anything does not work as expected or a special compiler should be used, changes can be specified
in a configuration file.
*pystencils* looks for a configuration file in JSON format at the following locations in the listed order.
1. at the path specified in the environment variable ``PYSTENCILS_CONFIG``
2. in the current working directory, in a file named ``pystencils.json``
3. in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
   ``%HOMEPATH%\.pystencils\config.json`` (Windows)
If no configuration file is found, a default configuration is created at the above-mentioned location in your home
directory. So run *pystencils* once, then edit the created configuration file.
Compiler Config (Linux)
-----------------------
- **'os'**: should be detected automatically as 'linux'
- **'command'**: path to C++ compiler (defaults to 'g++')
- **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
For most Linux compilers the qualifier is ``__restrict__``
Compiler Config (Windows)
-------------------------
*pystencils* uses the mechanism of *setuptools.msvc* to search for a compilation environment.
Then 'cl.exe' is used to compile.
- **'os'**: should be detected automatically as 'windows'
- **'msvc_version'**: a version number, a year number, 'auto' or 'latest' (for automatic detection of the latest
                      installed version), or 'setuptools' for setuptools-based detection. Alternatively, the path to
                      the folder where Visual Studio is installed; this path has to contain a file called 'vcvarsall.bat'
- **'arch'**: 'x86' or 'x64'
- **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
For Windows compilers the qualifier should be ``__restrict``
"""
from appdirs import user_cache_dir, user_config_dir
from collections import OrderedDict
from typing import Callable
import importlib.util
import json
import os
import platform
import shutil
import subprocess
import sysconfig
import tempfile
import time
import warnings
from ..codegen import Kernel
from .jit import JitBase, KernelWrapper
from .cpu_extension_module import PsKernelExtensioNModule
from .msvc_detection import get_environment
from pystencils.include import get_pystencils_include_path
from pystencils.utils import atomic_file_write, recursive_dict_update
class CpuKernelWrapper(KernelWrapper):
    def __init__(self, kfunc: Kernel, compiled_kernel: Callable[..., None]) -> None:
        super().__init__(kfunc)
        self._compiled_kernel = compiled_kernel

    def __call__(self, **kwargs) -> None:
        self._compiled_kernel(**kwargs)

    @property
    def kernel(self) -> Callable[..., None]:
        return self._compiled_kernel


class LegacyCpuJit(JitBase):
    """Wrapper around ``pystencils.cpu.cpujit``"""

    def compile(self, kernel: Kernel) -> KernelWrapper:
        return compile_and_load(kernel)
def make_python_function(kernel_function_node, custom_backend=None):
    """
    Creates C code from the abstract syntax tree, compiles it, and makes it accessible as a Python function.

    The parameters of the kernel are:
        - numpy arrays for each field used in the kernel. The keyword argument name is the name of the field
        - all symbols which are not defined in the kernel itself are expected as parameters

    :param kernel_function_node: the abstract syntax tree
    :param custom_backend: use your own custom printer for code generation
    :return: kernel functor
    """
    result = compile_and_load(kernel_function_node, custom_backend)
    return result
def set_config(config):
    """
    Override the configuration provided in the config file.

    Configuration of compiler parameters:
    If this function is not called, the configuration is taken from a config file in JSON format which
    is searched for in the following locations in the order specified:
        - at the location provided in the environment variable ``PYSTENCILS_CONFIG`` (if this variable exists)
        - a file called ``pystencils.json`` in the current working directory
        - ``~/.config/pystencils/config.json`` in your home directory

    If none of these files exist, a default configuration file is created at the last location.

    An example JSON file with all possible keys. If not all keys are specified, default values are used.

    ``
    {
        "compiler": {
            "command": "/software/intel/2017/bin/icpc",
            "flags": "-Ofast -DNDEBUG -fPIC -march=native -fopenmp",
            "env": {
                "LM_PROJECT": "iwia"
            }
        }
    }
    ``
    """
    global _config
    _config = config.copy()
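
A sketch of overriding the configuration programmatically via `set_config` (values are illustrative; since the passed dict replaces the whole configuration, both the ``compiler`` and ``cache`` sections should be present):

    set_config({
        "compiler": {
            "os": "linux",
            "command": "clang++",
            "flags": "-O3 -DNDEBUG -fPIC -march=native -fopenmp -std=c++11",
            "restrict_qualifier": "__restrict__",
        },
        "cache": {"object_cache": False},   # skip the on-disk object cache entirely
    })
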
def get_configuration_file_path():
    config_path_in_home = os.path.join(user_config_dir("pystencils"), "config.json")

    # 1) Read path from environment variable if found
    if "PYSTENCILS_CONFIG" in os.environ:
        return os.environ["PYSTENCILS_CONFIG"], True
    # 2) Look in current directory for pystencils.json
    elif os.path.exists("pystencils.json"):
        return "pystencils.json", True
    # 3) Try ~/.config/pystencils/config.json
    elif os.path.exists(config_path_in_home):
        return config_path_in_home, True
    else:
        return config_path_in_home, False
def create_folder(path, is_file):
    if is_file:
        path = os.path.split(path)[0]
    try:
        os.makedirs(path)
    except os.error:
        pass
def read_config():
    if platform.system().lower() == "linux":
        default_compiler_config = OrderedDict(
            [
                ("os", "linux"),
                ("command", "g++"),
                ("flags", "-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11"),
                ("restrict_qualifier", "__restrict__"),
            ]
        )
        if platform.machine().startswith("ppc64") or platform.machine() == "arm64":
            default_compiler_config["flags"] = default_compiler_config["flags"].replace(
                "-march=native", "-mcpu=native"
            )
    elif platform.system().lower() == "windows":
        default_compiler_config = OrderedDict(
            [
                ("os", "windows"),
                ("msvc_version", "latest"),
                ("arch", "x64"),
                ("flags", "/Ox /fp:fast /OpenMP /arch:avx"),
                ("restrict_qualifier", "__restrict"),
            ]
        )
        if platform.machine() == "ARM64":
            default_compiler_config["arch"] = "ARM64"
            default_compiler_config["flags"] = default_compiler_config["flags"].replace(
                " /arch:avx", ""
            )
    elif platform.system().lower() == "darwin":
        default_compiler_config = OrderedDict(
            [
                ("os", "darwin"),
                ("command", "clang++"),
                (
                    "flags",
                    "-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11",
                ),
                ("restrict_qualifier", "__restrict__"),
            ]
        )
        if platform.machine() == "arm64":
            default_compiler_config["flags"] = default_compiler_config["flags"].replace(
                "-march=native ", ""
            )
        for libomp in [
            "/opt/local/lib/libomp/libomp.dylib",
            "/usr/local/lib/libomp.dylib",
            "/opt/homebrew/lib/libomp.dylib",
        ]:
            if os.path.exists(libomp):
                default_compiler_config["flags"] += " " + libomp
                break
    else:
        raise NotImplementedError(
            "Generation of default compiler flags for %s is not implemented"
            % (platform.system(),)
        )

    default_cache_config = OrderedDict(
        [
            ("object_cache", os.path.join(user_cache_dir("pystencils"), "objectcache")),
            ("clear_cache_on_start", False),
        ]
    )

    default_config = OrderedDict(
        [("compiler", default_compiler_config), ("cache", default_cache_config)]
    )

    config_path, config_exists = get_configuration_file_path()
    config = default_config.copy()
    if config_exists:
        with open(config_path, "r") as json_config_file:
            loaded_config = json.load(json_config_file)
        config = recursive_dict_update(config, loaded_config)
    else:
        create_folder(config_path, True)
        with open(config_path, "w") as f:
            json.dump(config, f, indent=4)

    if config["cache"]["object_cache"] is not False:
        config["cache"]["object_cache"] = os.path.expanduser(
            config["cache"]["object_cache"]
        ).format(pid=os.getpid())

        clear_cache_on_start = False
        cache_status_file = os.path.join(
            config["cache"]["object_cache"], "last_config.json"
        )
        if os.path.exists(cache_status_file):
            # check if compiler config has changed
            last_config = json.load(open(cache_status_file, "r"))
            if set(last_config.items()) != set(config["compiler"].items()):
                clear_cache_on_start = True
            else:
                for key in last_config.keys():
                    if last_config[key] != config["compiler"][key]:
                        clear_cache_on_start = True

        if config["cache"]["clear_cache_on_start"] or clear_cache_on_start:
            shutil.rmtree(config["cache"]["object_cache"], ignore_errors=True)

        create_folder(config["cache"]["object_cache"], False)
        with tempfile.NamedTemporaryFile(
            "w", dir=os.path.dirname(cache_status_file), delete=False
        ) as f:
            json.dump(config["compiler"], f, indent=4)
        os.replace(f.name, cache_status_file)

    if config["compiler"]["os"] == "windows":
        msvc_env = get_environment(
            config["compiler"]["msvc_version"], config["compiler"]["arch"]
        )
        if "env" not in config["compiler"]:
            config["compiler"]["env"] = {}
        config["compiler"]["env"].update(msvc_env)

    return config

_config = read_config()

def get_compiler_config():
    return _config["compiler"]


def get_cache_config():
    return _config["cache"]

def add_or_change_compiler_flags(flags):
    if not isinstance(flags, list) and not isinstance(flags, tuple):
        flags = [flags]

    compiler_config = get_compiler_config()
    cache_config = get_cache_config()
    cache_config["object_cache"] = False  # disable cache

    for flag in flags:
        flag = flag.strip()
        if "=" in flag:
            base = flag.split("=")[0].strip()
        else:
            base = flag
        new_flags = [
            c for c in compiler_config["flags"].split() if not c.startswith(base)
        ]
        new_flags.append(flag)
        compiler_config["flags"] = " ".join(new_flags)
def clear_cache():
    cache_config = get_cache_config()
    if cache_config["object_cache"] is not False:
        shutil.rmtree(cache_config["object_cache"], ignore_errors=True)
        create_folder(cache_config["object_cache"], False)

def load_kernel_from_file(module_name, function_name, path):
    try:
        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
        warnings.warn(f"Could not load {path}, trying one more time in 5 seconds ...")
        time.sleep(5)
        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

    return getattr(mod, function_name)

def run_compile_step(command):
    compiler_config = get_compiler_config()
    config_env = compiler_config["env"] if "env" in compiler_config else {}
    compile_environment = os.environ.copy()
    compile_environment.update(config_env)
    try:
        shell = True if compiler_config["os"].lower() == "windows" else False
        subprocess.check_output(
            command, env=compile_environment, stderr=subprocess.STDOUT, shell=shell
        )
    except subprocess.CalledProcessError as e:
        print(" ".join(command))
        print(e.output.decode("utf8"))
        raise e

def compile_module(code, code_hash, base_dir, compile_flags=None):
    if compile_flags is None:
        compile_flags = []

    compiler_config = get_compiler_config()
    extra_flags = [
        "-I" + sysconfig.get_paths()["include"],
        "-I" + get_pystencils_include_path(),
    ] + compile_flags

    if compiler_config["os"].lower() == "windows":
        lib_suffix = ".pyd"
        object_suffix = ".obj"
        windows = True
    else:
        lib_suffix = ".so"
        object_suffix = ".o"
        windows = False

    src_file = os.path.join(base_dir, code_hash + ".cpp")
    lib_file = os.path.join(base_dir, code_hash + lib_suffix)
    object_file = os.path.join(base_dir, code_hash + object_suffix)

    if not os.path.exists(object_file):
        try:
            with open(src_file, "x") as f:
                code.write_to_file(f)
        except FileExistsError:
            pass

        if windows:
            compile_cmd = ["cl.exe", "/c", "/EHsc"] + compiler_config["flags"].split()
            compile_cmd += [*extra_flags, src_file, "/Fo" + object_file]
            run_compile_step(compile_cmd)
        else:
            with atomic_file_write(object_file) as file_name:
                compile_cmd = [compiler_config["command"], "-c"] + compiler_config[
                    "flags"
                ].split()
                compile_cmd += [*extra_flags, "-o", file_name, src_file]
                run_compile_step(compile_cmd)

        # Linking
        if windows:
            config_vars = sysconfig.get_config_vars()
            py_lib = os.path.join(
                config_vars["installed_base"],
                "libs",
                f"python{config_vars['py_version_nodot']}.lib",
            )
            run_compile_step(
                ["link.exe", py_lib, "/DLL", "/out:" + lib_file, object_file]
            )
        elif platform.system().lower() == "darwin":
            with atomic_file_write(lib_file) as file_name:
                run_compile_step(
                    [
                        compiler_config["command"],
                        "-shared",
                        object_file,
                        "-o",
                        file_name,
                        "-undefined",
                        "dynamic_lookup",
                    ]
                    + compiler_config["flags"].split()
                )
        else:
            with atomic_file_write(lib_file) as file_name:
                run_compile_step(
                    [
                        compiler_config["command"],
                        "-shared",
                        object_file,
                        "-o",
                        file_name,
                    ]
                    + compiler_config["flags"].split()
                )
    return lib_file

def compile_and_load(kernel: Kernel, custom_backend=None):
    cache_config = get_cache_config()

    compiler_config = get_compiler_config()
    function_prefix = (
        "__declspec(dllexport)" if compiler_config["os"].lower() == "windows" else ""
    )

    code = PsKernelExtensioNModule()

    code.add_function(kernel, kernel.name)

    code.create_code_string(compiler_config["restrict_qualifier"], function_prefix)
    code_hash_str = code.get_hash_of_code()

    compile_flags = []
    # TODO: replace
    # if kernel.instruction_set and "compile_flags" in kernel.instruction_set:
    #     compile_flags = kernel.instruction_set["compile_flags"]

    if cache_config["object_cache"] is False:
        with tempfile.TemporaryDirectory() as base_dir:
            lib_file = compile_module(
                code, code_hash_str, base_dir, compile_flags=compile_flags
            )
            result = load_kernel_from_file(code_hash_str, kernel.name, lib_file)
    else:
        lib_file = compile_module(
            code,
            code_hash_str,
            base_dir=cache_config["object_cache"],
            compile_flags=compile_flags,
        )
        result = load_kernel_from_file(code_hash_str, kernel.name, lib_file)

    return CpuKernelWrapper(kernel, result)
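
A sketch of the CPU path end to end (hedged: assumes `kernel` is a `Kernel` object produced by the pystencils code generator, with two 2D fields named ``f`` and ``g``):

    import numpy as np

    wrapper = compile_and_load(kernel)   # generates C++, compiles, loads the shared library
    f_arr = np.ones((32, 32))
    g_arr = np.zeros_like(f_arr)
    wrapper(f=f_arr, g=g_arr)            # field names become keyword arguments
    print(wrapper.code)                  # inspect the generated C code
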
import os
import subprocess

def get_environment(version_specifier, arch="x64"):
    """Returns an environment dictionary for activating the Visual Studio compiler.

    Args:
        version_specifier: a version number, a year number, 'auto' or 'latest' for automatic detection of the latest
                           installed version, or 'setuptools' for setuptools-based detection
        arch: 'x86' or 'x64'
    """
    if version_specifier == "setuptools":
        return get_environment_from_setup_tools(arch)
    elif "\\" in version_specifier:
        vc_vars_path = find_vc_vars_all_via_filesystem_search(version_specifier)
        return get_environment_from_vc_vars_file(vc_vars_path, arch)
    else:
        try:
            if version_specifier in ("auto", "latest"):
                version_nr = find_latest_msvc_version_using_environment_variables()
            else:
                version_nr = normalize_msvc_version(version_specifier)
            vc_vars_path = get_vc_vars_path_via_environment_variable(version_nr)
        except ValueError:
            vc_vars_path = find_vc_vars_all_via_filesystem_search(
                "C:\\Program Files (x86)\\Microsoft Visual Studio"
            )
            if vc_vars_path is None:
                vc_vars_path = find_vc_vars_all_via_filesystem_search(
                    "C:\\Program Files\\Microsoft Visual Studio"
                )
            if vc_vars_path is None:
                raise ValueError(
                    "Visual Studio not found. Write the path to your VS folder into the pystencils config"
                )
        return get_environment_from_vc_vars_file(vc_vars_path, arch)

def find_latest_msvc_version_using_environment_variables():
    import re

    # noinspection SpellCheckingInspection
    regex = re.compile(r"VS(\d\d)\dCOMNTOOLS")
    versions = []
    for key, value in os.environ.items():
        match = regex.match(key)
        if match:
            versions.append(int(match.group(1)))
    if len(versions) == 0:
        raise ValueError("Visual Studio not found.")
    versions.sort()
    return versions[-1]

def normalize_msvc_version(version):
    """
    Takes version specifiers in the following forms:
        - year: 2012, 2013, 2015, either as int or string
        - version number with or without dot, i.e. 11.0 or 11

    :return: integer version number
    """
    if isinstance(version, str) and "." in version:
        version = version.split(".")[0]

    version = int(version)
    mapping = {2015: 14, 2013: 12, 2012: 11}
    if version in mapping:
        return mapping[version]
    else:
        return version

def get_environment_from_vc_vars_file(vc_vars_file, arch):
    out = subprocess.check_output(
        f'cmd /u /c "{vc_vars_file}" {arch} && set',
        stderr=subprocess.STDOUT,
    ).decode("utf-16le", errors="replace")

    env = {
        key.upper(): value
        for key, _, value in (line.partition("=") for line in out.splitlines())
        if key and value
    }
    return env

def get_vc_vars_path_via_environment_variable(version_nr):
    # noinspection SpellCheckingInspection
    environment_var_name = "VS%d0COMNTOOLS" % (version_nr,)
    vc_path = os.environ[environment_var_name]
    path = os.path.join(vc_path, "..", "..", "VC", "vcvarsall.bat")
    return os.path.abspath(path)


def get_environment_from_setup_tools(arch):
    from setuptools.msvc import msvc14_get_vc_env

    msvc_env = msvc14_get_vc_env(arch)
    return {k.upper(): v for k, v in msvc_env.items()}

def find_vc_vars_all_via_filesystem_search(base_path):
    matches = []
    for root, dir_names, file_names in os.walk(base_path):
        for filename in file_names:
            if filename == "vcvarsall.bat":
                matches.append(os.path.join(root, filename))

    matches.sort(reverse=True)
    if matches:
        return matches[0]
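
A usage sketch (Windows only; output is machine-dependent): resolving a 64-bit MSVC environment and checking its header search path:

    env = get_environment("latest", arch="x64")
    print(env.get("INCLUDE", "<not set>"))
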
@@ -5,9 +5,9 @@ from typing import Callable, Union, List, Dict, Tuple
 import sympy as sp
-from pystencils.assignment import Assignment
-from pystencils.sympyextensions import SymbolCreator
-from pystencils.config import CreateKernelConfig
+from .assignment import Assignment
+from .sympyextensions import SymbolCreator
+from .codegen import CreateKernelConfig
 __all__ = ['kernel', 'kernel_config']
...
import pystencils
from .jit import KernelWrapper as _KernelWrapper


class KernelWrapper:
    """
    Light-weight wrapper around a compiled kernel.

    Can be called while still providing access to underlying AST.
    """

    def __init__(self, kernel, parameters, ast_node: pystencils.astnodes.KernelFunction):
        self.kernel = kernel
        self.parameters = parameters
        self.ast = ast_node
        self.num_regs = None

    def __call__(self, **kwargs):
        return self.kernel(**kwargs)

    @property
    def code(self):
        return pystencils.get_code_str(self.ast)


KernelWrapper = _KernelWrapper
import itertools
import warnings
from typing import Union, List
from .codegen import Target
from .codegen import create_kernel as _create_kernel
import sympy as sp
from pystencils.config import CreateKernelConfig
from warnings import warn
from pystencils.assignment import Assignment, AddAugmentedAssignment
from pystencils.astnodes import Node, Block, Conditional, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.vectorization import vectorize
from pystencils.enums import Target, Backend
from pystencils.field import Field, FieldType
from pystencils.node_collection import NodeCollection
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.kernel_contrains_check import KernelConstraintsCheck
from pystencils.simplificationfactory import create_simplification_strategy
from pystencils.stencil import direction_string_to_offset, inverse_direction_string
from pystencils.transformations import (
    loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)

warn(
    "Importing anything from `pystencils.kernelcreation` is deprecated "
    "and the module will be removed in pystencils 2.1. "
    "Import from `pystencils` instead.",
    FutureWarning,
)

def create_kernel(assignments: Union[Assignment, List[Assignment],
                                     AddAugmentedAssignment, List[AddAugmentedAssignment],
                                     AssignmentCollection, List[Node], NodeCollection],
                  *,
                  config: CreateKernelConfig = None, **kwargs):
    """
    Creates an abstract syntax tree (AST) of the kernel, using a list of update equations.

    This function forms the general API and delegates the kernel creation to others depending on the CreateKernelConfig.

    Args:
        assignments: can be a single assignment, a sequence of assignments or an `AssignmentCollection`
        config: CreateKernelConfig which includes the needed configuration
        kwargs: arguments for updating the config

    Returns:
        abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
        can be compiled through its 'compile()' member

    Example:
        >>> import pystencils as ps
        >>> import numpy as np
        >>> s, d = ps.fields('s, d: [2D]')
        >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
        >>> kernel_ast = ps.create_kernel(assignment, config=ps.CreateKernelConfig(cpu_openmp=True))
        >>> kernel = kernel_ast.compile()
        >>> d_arr = np.zeros([5, 5])
        >>> kernel(d=d_arr, s=np.ones([5, 5]))
        >>> d_arr
        array([[0., 0., 0., 0., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 0., 0., 0., 0.]])
    """
    # ---- Updating configuration from kwargs
    if not config:
        config = CreateKernelConfig(**kwargs)
    else:
        for k, v in kwargs.items():
            if not hasattr(config, k):
                raise KeyError(f'{k} is not a valid kwarg. Please look in CreateKernelConfig for valid settings')
            setattr(config, k, v)
    # ---- Normalizing parameters
    if isinstance(assignments, (Assignment, AddAugmentedAssignment)):
        assignments = [assignments]
    assert assignments, "Assignments must not be empty!"
    if isinstance(assignments, list):
        assignments = NodeCollection(assignments)
    elif isinstance(assignments, AssignmentCollection):
        # TODO Markus: check and document
        # --- applying first default simplifications
        try:
            if config.default_assignment_simplifications:
                simplification = create_simplification_strategy()
                assignments = simplification(assignments)
        except Exception as e:
            warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
                          f"AssignmentCollection due to the following problem: {e}")
        simplification_hints = assignments.simplification_hints
        assignments = NodeCollection.from_assignment_collection(assignments)
        assignments.simplification_hints = simplification_hints

    if config.index_fields:
        return create_indexed_kernel(assignments, config=config)
    else:
        return create_domain_kernel(assignments, config=config)


# In the 2.0 module, the deprecated implementation above is replaced by the new code generator:
create_kernel = _create_kernel

def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
    """
    Creates an abstract syntax tree (AST) of the kernel, using a NodeCollection.

    Note that `create_domain_kernel` is a lower-level function which should be accessed by not providing `index_fields`
    to create_kernel

    Args:
        assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
        config: CreateKernelConfig which includes the needed configuration

    Returns:
        abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
        can be compiled through its 'compile()' member

    Example:
        >>> import pystencils as ps
        >>> import numpy as np
        >>> from pystencils.kernelcreation import create_domain_kernel
        >>> from pystencils.node_collection import NodeCollection
        >>> s, d = ps.fields('s, d: [2D]')
        >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
        >>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True)
        >>> kernel_ast = create_domain_kernel(NodeCollection([assignment]), config=kernel_config)
        >>> kernel = kernel_ast.compile()
        >>> d_arr = np.zeros([5, 5])
        >>> kernel(d=d_arr, s=np.ones([5, 5]))
        >>> d_arr
        array([[0., 0., 0., 0., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 0., 0., 0., 0.]])
    """
    # --- eval
    assignments.evaluate_terms()

    # FUTURE WORK: from here on we shouldn't NEED sympy
    # --- check constraints
    check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
                                   check_double_write_condition=not config.allow_double_writes)
    check.visit(assignments)

    assignments.bound_fields = check.fields_written
    assignments.rhs_fields = check.fields_read

    # ---- Creating ast
    ast = None
    if config.target == Target.CPU:
        if config.backend == Backend.C:
            from pystencils.cpu import add_openmp, create_kernel
            ast = create_kernel(assignments, config=config)
            for optimization in config.cpu_prepend_optimizations:
                optimization(ast)
            omp_collapse = None
            if config.cpu_blocking:
                omp_collapse = loop_blocking(ast, config.cpu_blocking)
            if config.cpu_openmp:
                add_openmp(ast, num_threads=config.cpu_openmp, collapse=omp_collapse,
                           assume_single_outer_loop=config.omp_single_loop)
            if config.cpu_vectorize_info:
                if config.cpu_vectorize_info is True:
                    vectorize(ast)
                elif isinstance(config.cpu_vectorize_info, dict):
                    vectorize(ast, **config.cpu_vectorize_info)

                    if config.cpu_openmp and config.cpu_blocking and 'nontemporal' in config.cpu_vectorize_info and \
                            config.cpu_vectorize_info['nontemporal'] and 'cachelineZero' in ast.instruction_set:
                        # This condition is stricter than it needs to be: if blocks along the fastest axis start on a
                        # cache line boundary, it's okay. But we cannot determine that here.
                        # We don't need to disallow OpenMP collapsing because it is never applied to the inner loop.
                        raise ValueError("Blocking cannot be combined with cacheline-zeroing")
                else:
                    raise ValueError("Invalid value for cpu_vectorize_info")
    elif config.target == Target.GPU:
        if config.backend == Backend.CUDA:
            from pystencils.gpu import create_cuda_kernel
            ast = create_cuda_kernel(assignments, config=config)

    if not ast:
        raise NotImplementedError(
            f'{config.target} together with {config.backend} is not supported by `create_domain_kernel`')

    if config.use_auto_for_assignments:
        for a in ast.atoms(SympyAssignment):
            a.use_auto = True

    return ast

def create_indexed_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
    """
    Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
    coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.

    The coordinates are stored in a separate index_field, which is a one-dimensional array with struct data type.
    This struct has to contain fields named 'x', 'y' and for 3D fields ('z'). These names are configurable with the
    'coordinate_names' parameter. The struct can also have other fields that can be read and written in the kernel,
    for example boundary parameters.

    Note that `create_indexed_kernel` is a lower-level function which should be accessed by providing `index_fields`
    to create_kernel

    Args:
        assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
        config: CreateKernelConfig which includes the needed configuration

    Returns:
        abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
        can be compiled through its 'compile()' member

    Example:
        >>> import pystencils as ps
        >>> from pystencils.node_collection import NodeCollection
        >>> import numpy as np
        >>> from pystencils.kernelcreation import create_indexed_kernel
        >>>
        >>> # Index field stores the indices of the cell to visit together with optional values
        >>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
        >>> index_arr = np.array([(1, 1, 0.1), (2, 2, 0.2), (3, 3, 0.3)], dtype=index_arr_dtype)
        >>> idx_field = ps.fields(idx=index_arr)
        >>>
        >>> # Additional values stored in index field can be accessed in the kernel as well
        >>> s, d = ps.fields('s, d: [2D]')
        >>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
        >>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y'))
        >>> kernel_ast = create_indexed_kernel(NodeCollection([assignment]), config=kernel_config)
        >>> kernel = kernel_ast.compile()
        >>> d_arr = np.zeros([5, 5])
        >>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
        >>> d_arr
        array([[0. , 0. , 0. , 0. , 0. ],
               [0. , 4.1, 0. , 0. , 0. ],
               [0. , 0. , 4.2, 0. , 0. ],
               [0. , 0. , 0. , 4.3, 0. ],
               [0. , 0. , 0. , 0. , 0. ]])
    """
    # --- eval
    assignments.evaluate_terms()

    # FUTURE WORK: from here on we shouldn't NEED sympy
    # --- check constraints
    check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
                                   check_double_write_condition=not config.allow_double_writes)
    check.visit(assignments)

    assignments.bound_fields = check.fields_written
    assignments.rhs_fields = check.fields_read

    ast = None
    if config.target == Target.CPU and config.backend == Backend.C:
        from pystencils.cpu import add_openmp, create_indexed_kernel
        ast = create_indexed_kernel(assignments, config=config)
        if config.cpu_openmp:
            add_openmp(ast, num_threads=config.cpu_openmp)
    elif config.target == Target.GPU:
        if config.backend == Backend.CUDA:
            from pystencils.gpu import created_indexed_cuda_kernel
            ast = created_indexed_cuda_kernel(assignments, config=config)

    if not ast:
        raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}')

    return ast

def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs):
    """Kernel that updates a staggered field.

    .. image:: /img/staggered_grid.svg

    For a staggered field, the first index coordinate defines the location of the staggered value.
    Further index coordinates can be used to store vectors/tensors at each point.

    Args:
        assignments: a sequence of assignments or an AssignmentCollection.
                     Assignments to staggered fields are processed specially, while subexpressions and assignments to
                     regular fields are passed through to `create_kernel`. Multiple different staggered fields can be
                     used, but they all need to use the same stencil (i.e. the same number of staggered points) and
                     shape.
        target: 'CPU' or 'GPU'
        gpu_exclusive_conditions: disable the use of multiple conditionals inside the loop. The outer layers are then
                                  handled in an else branch.
        kwargs: passed directly to create_kernel; the iteration_slice and ghost_layers parameters are not allowed

    Returns:
        AST, see `create_kernel`
    """
    # TODO: Add documentation like for the other kernels
    if 'ghost_layers' in kwargs:
        assert kwargs['ghost_layers'] is None
        del kwargs['ghost_layers']
    if 'iteration_slice' in kwargs:
        assert kwargs['iteration_slice'] is None
        del kwargs['iteration_slice']
    if 'omp_single_loop' in kwargs:
        assert kwargs['omp_single_loop'] is False
        del kwargs['omp_single_loop']

    if isinstance(assignments, AssignmentCollection):
        subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments
                                                       if not hasattr(a, 'lhs')
                                                       or type(a.lhs) is not Field.Access
                                                       or not FieldType.is_staggered(a.lhs.field)]
        assignments = [a for a in assignments.main_assignments if hasattr(a, 'lhs')
                       and type(a.lhs) is Field.Access
                       and FieldType.is_staggered(a.lhs.field)]
    else:
        subexpressions = [a for a in assignments if not hasattr(a, 'lhs')
                          or type(a.lhs) is not Field.Access
                          or not FieldType.is_staggered(a.lhs.field)]
        assignments = [a for a in assignments if hasattr(a, 'lhs')
                       and type(a.lhs) is Field.Access
                       and FieldType.is_staggered(a.lhs.field)]
    if len(set([tuple(a.lhs.field.staggered_stencil) for a in assignments])) != 1:
        raise ValueError("All assignments need to be made to staggered fields with the same stencil")
    if len(set([a.lhs.field.shape for a in assignments])) != 1:
        raise ValueError("All assignments need to be made to staggered fields with the same shape")

    staggered_field = assignments[0].lhs.field
    stencil = staggered_field.staggered_stencil
    dim = staggered_field.spatial_dimensions
    shape = staggered_field.shape

    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]

    final_assignments = []

    # find out whether any of the ghost layers is not needed
    common_exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
    for direction in stencil:
        exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
        for elementary_direction in direction:
            exclusions.remove(inverse_direction_string(elementary_direction))
        common_exclusions.intersection_update(exclusions)
    ghost_layers = [[0, 0] for d in range(dim)]
    for direction in common_exclusions:
        direction = direction_string_to_offset(direction)
        for d, s in enumerate(direction):
            if s == 1:
                ghost_layers[d][1] = 1
            elif s == -1:
                ghost_layers[d][0] = 1

    def condition(direction):
        """exclude those staggered points that correspond to fluxes between ghost cells"""
        exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])

        for elementary_direction in direction:
            exclusions.remove(inverse_direction_string(elementary_direction))

        conditions = []
        for e in exclusions:
            if e in common_exclusions:
                continue
            offset = direction_string_to_offset(e)
            for i, o in enumerate(offset):
                if o == 1:
                    conditions.append(counters[i] < shape[i] - 1)
                elif o == -1:
                    conditions.append(counters[i] > 0)
        return sp.And(*conditions)

    if gpu_exclusive_conditions:
        outer_assignment = None
        conditions = {direction: condition(direction) for direction in stencil}
        for num_conditions in range(len(stencil)):
            for combination in itertools.combinations(conditions.values(), num_conditions):
                for assignment in assignments:
                    direction = stencil[assignment.lhs.index[0]]
                    if conditions[direction] in combination:
                        assignment = SympyAssignment(assignment.lhs, assignment.rhs)
                        outer_assignment = Conditional(sp.And(*combination), Block([assignment]), outer_assignment)

        inner_assignment = []
        for assignment in assignments:
            inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs))
        last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]),
                                       Block(inner_assignment), outer_assignment)
        final_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
                            [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
                            [last_conditional]

        config = CreateKernelConfig(target=target, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
        ast = create_kernel(final_assignments, config=config)
        return ast

    for assignment in assignments:
        direction = stencil[assignment.lhs.index[0]]
        sp_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
                         [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
                         [SympyAssignment(assignment.lhs, assignment.rhs)]
        last_conditional = Conditional(condition(direction), Block(sp_assignments))
        final_assignments.append(last_conditional)

    remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers])
    prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional),
                             move_constants_before_loop]
    if 'cpu_prepend_optimizations' in kwargs:
        prepend_optimizations += kwargs['cpu_prepend_optimizations']
        del kwargs['cpu_prepend_optimizations']

    config = CreateKernelConfig(ghost_layers=ghost_layers, target=target, omp_single_loop=False,
                                cpu_prepend_optimizations=prepend_optimizations, **kwargs)
    ast = create_kernel(final_assignments, config=config)
    return ast

def create_staggered_kernel(
    assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs
):
    raise NotImplementedError(
        "Staggered kernels are not yet implemented for pystencils 2.0"
    )
@@ -11,7 +11,7 @@ from sympy.logic.boolalg import BooleanFalse, BooleanTrue
 from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
 from sympy.functions.elementary.hyperbolic import HyperbolicFunction
-from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
+from pystencils.sympyextensions.astnodes import KernelFunction, LoopOverCoordinate, Node
 from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
 from pystencils.typing import (
     PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
...
-from pystencils.astnodes import Node
+from pystencils.sympyextensions.astnodes import Node
 from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
 from pystencils.enums import Backend
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
...
@@ -55,7 +55,7 @@ class DotPrinter(Printer):
 def __shortened(node):
-    from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Conditional
+    from pystencils.sympyextensions.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Conditional
     if isinstance(node, LoopOverCoordinate):
         return "Loop over dim %d" % (node.coordinate_to_loop_over,)
     elif isinstance(node, KernelFunction):
...
@@ -9,7 +9,7 @@
 """
 import json
-from pystencils.astnodes import NodeOrExpr
+from pystencils.sympyextensions.astnodes import NodeOrExpr
 from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
 try:
...
@@ -99,7 +99,7 @@ def get_cacheline_size(instruction_set):
         return None
     import pystencils as ps
-    from pystencils.astnodes import SympyAssignment
+    from pystencils.sympyextensions.astnodes import SympyAssignment
    import numpy as np
     from pystencils.cpu.vectorization import CachelineSize
...
@@ -5,8 +5,7 @@ from types import MappingProxyType
 from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable
 from pystencils import Target, Backend, Field
-from pystencils.typing.typed_sympy import BasicType
-from pystencils.typing.utilities import collate_types
+from ..sympyextensions.typed_sympy import BasicType
 import numpy as np
...
-from pystencils.cpu.cpujit import make_python_function
-from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas
+from .cpujit import make_python_function
+from .kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas
 __all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
...
@@ -61,7 +61,7 @@ import warnings
 import numpy as np
 from pystencils import FieldType
-from pystencils.astnodes import LoopOverCoordinate
+from pystencils.sympyextensions.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers, CFunction
 from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
...
@@ -619,7 +619,13 @@ def compile_and_load(ast, custom_backend=None):
     compiler_config = get_compiler_config()
     function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else ''
-    code = ExtensionModuleCode(custom_backend=custom_backend)
+    from ..nbackend.ast import PsKernelFunction
+    if isinstance(ast, PsKernelFunction):
+        from ..nbackend.jit.cpu_extension_module import PsKernelExtensioNModule
+        code = PsKernelExtensioNModule()
+    else:
+        code = ExtensionModuleCode(custom_backend=custom_backend)
     code.add_function(ast, ast.function_name)
     code.create_code_string(compiler_config['restrict_qualifier'], function_prefix)
...