Skip to content
Snippets Groups Projects
Commit 103553f4 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

add CPU optimizer config. Extend various doc comments.

parent ab47a5a4
Branches
Tags
No related merge requests found
Pipeline #64102 failed
......@@ -31,10 +31,15 @@ class PsAstNode(ABC):
@abstractmethod
def clone(self) -> PsAstNode:
    """Perform a deep copy of the AST.

    Implementations must return a new tree that is structurally equal to
    ``self`` but shares no mutable nodes with it.
    """
def structurally_equal(self, other: PsAstNode) -> bool:
"""Check two ASTs for structural equality."""
"""Check two ASTs for structural equality.
By default this method checks the node's type and children.
If an AST node has additional internal state, it MUST override this method.
"""
return (
(type(self) is type(other))
and len(self.children) == len(other.children)
......
......@@ -41,25 +41,20 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
class KernelCreationContext:
"""Manages the translation process from the SymPy frontend to the backend AST, and collects
all necessary information for the translation.
all necessary information for the translation:
- *Data Types*: The kernel creation context manages the default data types for loop limits
and counters, index calculations, and the typifier.
- *Symbols*: The context maintains a symbol table, keeping track of all symbols encountered
during kernel translation together with their types.
- *Fields and Arrays*: The context collects all fields encountered during code generation,
applies a few consistency checks to them, and manages their associated arrays.
- *Iteration Space*: The context manages the iteration space of the kernel currently being
translated.
- *Constraints*: The context collects all kernel parameter constraints introduced during the
translation process.
- *Required Headers*: The context collects all header files required for the kernel to run.
Data Types
----------
The kernel creation context manages the default data types for loop limits and counters, index calculations,
and the typifier.
Fields and Arrays
------------------
The kernel creation context acts as a factory for mapping fields to arrays.
Iteration Space
---------------
The context manages the iteration space within which the current translation takes place. It may be a sparse
or full iteration space.
"""
def __init__(
......
......@@ -54,6 +54,7 @@ class FreezeExpressions:
- Augmented Assignments
- AddressOf
- Conditionals (+ frontend class)
- Relations (sp.Relational)
- pystencils.integer_functions
- pystencils.sympyextensions.bit_masks
- GPU fast approximations (pystencils.fast_approximation)
......
......@@ -132,17 +132,21 @@ class KernelFunction:
@property
def target(self) -> Target:
    """The hardware target this kernel was generated for; see `pystencils.Target`."""
    return self._target
@property
def name(self) -> str:
    """Name of this kernel function."""
    return self._name

@name.setter
def name(self, n: str) -> None:
    """Rename this kernel function."""
    self._name = n
@property
def function_name(self) -> str:
    """Alias of `name`, retained for backward compatibility."""
    return self._name
@property
def parameters(self) -> tuple[KernelParameter, ...]:
    """The parameters of this kernel function."""
    return self._params
......
......@@ -5,7 +5,7 @@ from .exceptions import PsInternalCompilerError
class PsSymbol:
"""A mutable symbol with name and data type.
Be advised to not create objects of this class directly unless you know what you are doing;
Do not create objects of this class directly unless you know what you are doing;
instead obtain them from a `KernelCreationContext` through `KernelCreationContext.get_symbol`.
This way, the context can keep track of all symbols used in the translation run,
and uniqueness of symbols is ensured.
......@@ -50,4 +50,4 @@ class PsSymbol:
return f"{self._name}: {dtype_str}"
def __repr__(self) -> str:
    """Unambiguous developer representation showing the symbol's name and data type."""
    # The stale `return str(self)` left over from the previous revision made
    # the line below unreachable; only the detailed form is kept.
    return f"PsSymbol({self._name}, {self._dtype})"
from __future__ import annotations
from collections.abc import Collection
from typing import Sequence
from dataclasses import dataclass
......@@ -11,6 +15,90 @@ from .types import PsIntegerType, PsNumericType, PsIeeeFloatType
from .defaults import DEFAULTS
@dataclass
class CpuOptimConfig:
    """Settings controlling the CPU-specific kernel optimizer.

    An error is raised if any option set here requests a capability that the
    CPU selected via `CreateKernelConfig.target` does not provide.
    """

    openmp: bool = False
    """Parallelize the kernel using OpenMP.

    When `True`, OpenMP parallelization is applied to the kernel according to
    the OpenMP-related settings of this configuration.
    """

    vectorize: bool | VectorizationConfig = False
    """Control automatic vectorization.

    Passing a `VectorizationConfig` makes pystencils vectorize the kernel with
    the given options, provided a CPU target with vector capabilities is
    selected. Passing `True` lets pystencils derive suitable vectorization
    options from the chosen CPU target, while `False` disables vectorization
    entirely.
    """

    loop_blocking: None | tuple[int, ...] = None
    """Tile sizes for loop blocking.

    When given, the kernel's loops are tiled using these block sizes.
    """

    use_cacheline_zeroing: bool = False
    """Emit cache-line zeroing instructions.

    When `True` and the selected CPU supports cache-line zeroing, the
    optimizer tries to generate such instructions wherever possible.
    """
@dataclass
class VectorizationConfig:
    """Settings controlling the auto-vectorizer.

    An error is raised if any option set here requests a capability that the
    CPU selected via `CreateKernelConfig.target` does not provide.
    """

    vector_width: int | None = None
    """Target vector register width, in bits.

    An integer value is taken as the desired register width; `None` lets the
    vectorizer pick the widest register width available. An error is raised
    if the selected CPU cannot handle the requested width.
    """

    use_nontemporal_stores: bool | Collection[str | Field] = False
    """Generate nontemporal (streaming) store instructions.

    With `True`, and a CPU that supports streaming stores, every store is
    emitted as a nontemporal store. With a collection of fields (or field
    names), only stores to those fields are emitted as nontemporal.
    """

    assume_aligned: bool = False
    """Assume aligned field base pointers.

    When `True`, the vectorizer assumes that, for each field, the address of
    the first inner entry (past the ghost layers) sits on the required byte
    boundary.
    """

    assume_inner_stride_one: bool = False
    """Assume a unit stride in every field's innermost spatial coordinate.

    When `True`, the vectorizer substitutes one for the innermost spatial
    stride of each field, which enables vectorization. An error is raised if
    a field already carries a fixed innermost stride different from one.
    """
@dataclass
class CreateKernelConfig:
"""Options for create_kernel."""
......@@ -67,6 +155,12 @@ class CreateKernelConfig:
This data type will be applied to all untyped symbols.
"""
cpu_optim: None | CpuOptimConfig = None
"""Configuration of the CPU kernel optimizer.
If this parameter is set while `target` is a non-CPU target, an error will be raised.
"""
def __post_init__(self):
# Check iteration space argument consistency
if (
......@@ -88,6 +182,14 @@ class CreateKernelConfig:
raise PsOptionsError(
"Only fields with `field_type == FieldType.INDEXED` can be specified as `index_field`"
)
# Check optim
if self.cpu_optim is not None:
if not self.target.is_cpu():
raise PsOptionsError(f"`cpu_optim` cannot be set for non-CPU target {self.target}")
if self.cpu_optim.vectorize is not False and not self.target.is_vector_cpu():
raise PsOptionsError(f"Cannot enable auto-vectorization for non-vector CPU target {self.target}")
# Infer JIT
if self.jit is None:
......
......@@ -37,7 +37,8 @@ def create_kernel(
assignments: AssignmentCollection | list[Assignment] | Assignment,
config: CreateKernelConfig = CreateKernelConfig(),
):
"""Create a kernel AST from an assignment collection."""
"""Create a kernel function from an assignment collection."""
ctx = KernelCreationContext(
default_dtype=config.default_dtype, index_dtype=config.index_dtype
)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment