From a898a3baec6386e83a775b1bec73a5f18cd60110 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 17 Jan 2025 19:27:01 +0100 Subject: [PATCH 1/7] config descriptors --- src/pystencils/codegen/config.py | 131 ++++++++++++++++++++++++++++++- tests/codegen/test_config.py | 57 ++++++++++++++ 2 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 tests/codegen/test_config.py diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 3a7647907..83602dbbc 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -2,10 +2,12 @@ from __future__ import annotations from typing import TYPE_CHECKING from warnings import warn +from abc import ABC from collections.abc import Collection +from copy import copy -from typing import Sequence -from dataclasses import dataclass, InitVar, replace +from typing import Sequence, Generic, TypeVar, Callable, Any, cast +from dataclasses import dataclass, InitVar, replace, fields from .target import Target from ..field import Field, FieldType @@ -28,6 +30,131 @@ class PsOptionsError(Exception): """Indicates an option clash in the `CreateKernelConfig`.""" +Option_T = TypeVar("Option_T") +Arg_T = TypeVar("Arg_T") + + +class Option(Generic[Option_T, Arg_T]): + """Option descriptor. + + This descriptor is used to model configuration options. + It maintains a default value for the option that is used when no value + was specified by the user. + + In configuration options, the value `None` stands for `unset`. + It can therefore not be used to set an option to the meaning "not any", or "empty" + - for these, special values need to be used. + + The Option allows a validator function to be specified, + which will be called to perform sanity checks on user-provided values. + + Through the validator, options may also be set from arguments of a different type (`Arg_T`) + than their value type (`Option_T`). 
If `Arg_T` is different from `Option_T`, + the validator must perform the conversion from the former to the latter. + """ + + def __init__( + self, + default: Option_T | None = None, + validator: Callable[[Any, Arg_T | None], Option_T | None] | None = None, + ) -> None: + self._default = default + self._validator = validator + self._name: str + self._lookup: str + + def validate(self, validator: Callable[[Any, Any], Any] | None): + self._validator = validator + return validator + + @property + def default(self) -> Option_T | None: + return self._default + + def get(self, obj) -> Option_T | None: + val = getattr(obj, self._lookup, None) + if val is None: + return self._default + else: + return val + + def is_set(self, obj) -> bool: + return getattr(obj, self._lookup, None) is not None + + def __set_name__(self, owner, name: str): + self._name = name + self._lookup = f"_{name}" + + def __get__(self, obj, objtype=None) -> Option_T | None: + if obj is None: + return None + + return getattr(obj, self._lookup, None) + + def __set__(self, obj, arg: Arg_T | None): + if arg is not None and self._validator is not None: + value = self._validator(obj, arg) + else: + value = cast(Option_T, arg) + setattr(obj, self._lookup, value) + + def __delete__(self, obj): + delattr(obj, self._lookup) + + +class SimpleOption(Option[Option_T, Option_T]): + ... 
+ + +class ConfigBase(ABC): + def get_option(self, name: str) -> Any: + """Get the value set for the specified option, or the option's default value if none has been set.""" + descr: Option = type(self).__dict__[name] + return descr.get(self) + + def is_option_set(self, name: str) -> bool: + descr: Option = type(self).__dict__[name] + return descr.is_set(self) + + def override(self, other: ConfigBase): + for f in fields(self): # type: ignore + fvalue = getattr(self, f.name) + if isinstance(fvalue, ConfigBase): # type: ignore + fvalue.override(getattr(other, f.name)) + else: + new_val = getattr(other, f.name) + if new_val is not None: + setattr(self, f.name, new_val) + + +Category_T = TypeVar("Category_T", bound=ConfigBase) + + +class Category(Generic[Category_T]): + """Descriptor for a category of options. + + This descriptor makes sure that when an entire category is set to an object, + that object is copied immediately such that later changes to the original + do not affect this configuration. + """ + + def __init__(self, default: Category_T): + self._default = default + + def __set_name__(self, owner, name: str): + self._name = name + self._lookup = f"_{name}" + + def __get__(self, obj, objtype=None) -> Category_T: + if obj is None: + return self._default + + return cast(Category_T, getattr(obj, self._lookup, None)) + + def __set__(self, obj, cat: Category_T): + setattr(obj, self._lookup, copy(cat)) + + class _AUTO_TYPE: ... 
# noqa: E701 diff --git a/tests/codegen/test_config.py b/tests/codegen/test_config.py new file mode 100644 index 000000000..613c279c3 --- /dev/null +++ b/tests/codegen/test_config.py @@ -0,0 +1,57 @@ +from dataclasses import dataclass +from pystencils.codegen.config import SimpleOption, Option, Category, ConfigBase + + +def test_descriptors(): + + @dataclass + class SampleCategory(ConfigBase): + val1: SimpleOption[int] = SimpleOption(2) + val2: Option[bool, str] = Option(False) + + @val2.validate + def _val2(self, v: str): + if v.lower() in ("off", "false", "no"): + return False + elif v.lower() in ("on", "true", "yes"): + return True + + raise ValueError() + + @dataclass + class SampleConfig(ConfigBase): + cat: Category[SampleCategory] = Category(SampleCategory()) + val: SimpleOption[str] = SimpleOption("fallback") + + cfg = SampleConfig() + + # Check unset and default values + assert cfg.val is None + assert cfg.get_option("val") == "fallback" + + # Check setting + cfg.val = "test" + assert cfg.val == "test" + assert cfg.get_option("val") == "test" + assert cfg.is_option_set("val") + + # Check unsetting + cfg.val = None + assert not cfg.is_option_set("val") + assert cfg.val is None + + # Check category + assert cfg.cat.val1 is None + assert cfg.cat.get_option("val1") == 2 + assert cfg.cat.val2 is None + assert cfg.cat.get_option("val2") is False + + # Check copy on category setting + c = SampleCategory(32, "on") + cfg.cat = c + assert cfg.cat.val1 == 32 + assert cfg.cat.val2 is True + + assert cfg.cat is not c + c.val1 = 13 + assert cfg.cat.val1 == 32 -- GitLab From 11c30f08cb2a00d8caa6ffb6f7ca47b351b3bda2 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 17 Jan 2025 20:20:29 +0100 Subject: [PATCH 2/7] Update config classes to use descriptors. 
Update parts of the codegen driver --- src/pystencils/__init__.py | 12 +- src/pystencils/backend/platforms/cuda.py | 8 +- src/pystencils/backend/platforms/sycl.py | 8 +- .../backend/transformations/add_pragmas.py | 4 +- src/pystencils/codegen/__init__.py | 12 +- src/pystencils/codegen/config.py | 282 ++++++++---------- src/pystencils/codegen/driver.py | 50 +++- tests/codegen/test_config.py | 25 +- tests/fixtures.py | 2 +- tests/kernelcreation/test_iteration_slices.py | 6 +- tests/nbackend/kernelcreation/test_openmp.py | 8 +- tests/test_quicktests.py | 2 +- 12 files changed, 215 insertions(+), 204 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 6cb375b61..2bb4aac3d 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -3,10 +3,10 @@ from .codegen import ( Target, CreateKernelConfig, - CpuOptimConfig, + CpuOptions, VectorizationConfig, - OpenMpConfig, - GpuIndexingConfig, + OpenMpOptions, + GpuOptions, AUTO ) from .defaults import DEFAULTS @@ -50,10 +50,10 @@ __all__ = [ "create_numeric_type", "make_slice", "CreateKernelConfig", - "CpuOptimConfig", + "CpuOptions", "VectorizationConfig", - "GpuIndexingConfig", - "OpenMpConfig", + "GpuOptions", + "OpenMpOptions", "AUTO", "create_kernel", "create_staggered_kernel", diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index f146cfbfd..31686cb18 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -30,7 +30,7 @@ from ..literals import PsLiteral from ..functions import PsMathFunction, MathFunctions, CFunction if TYPE_CHECKING: - from ...codegen import GpuIndexingConfig, GpuThreadsRange + from ...codegen import GpuOptions, GpuThreadsRange int32 = PsSignedIntegerType(width=32, const=False) @@ -52,13 +52,13 @@ class CudaPlatform(GenericGpu): """Platform for CUDA-based GPUs.""" def __init__( - self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None 
+ self, ctx: KernelCreationContext, indexing_cfg: GpuOptions | None = None ) -> None: super().__init__(ctx) - from ...codegen.config import GpuIndexingConfig + from ...codegen.config import GpuOptions - self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig() + self._cfg = indexing_cfg if indexing_cfg is not None else GpuOptions() self._typify = Typifier(ctx) @property diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 9c04d6074..b5ba7b6c4 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -30,19 +30,19 @@ from ..exceptions import MaterializationError from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType if TYPE_CHECKING: - from ...codegen import GpuIndexingConfig, GpuThreadsRange + from ...codegen import GpuOptions, GpuThreadsRange class SyclPlatform(GenericGpu): def __init__( - self, ctx: KernelCreationContext, indexing_cfg: GpuIndexingConfig | None = None + self, ctx: KernelCreationContext, indexing_cfg: GpuOptions | None = None ): super().__init__(ctx) - from ...codegen.config import GpuIndexingConfig + from ...codegen.config import GpuOptions - self._cfg = indexing_cfg if indexing_cfg is not None else GpuIndexingConfig() + self._cfg = indexing_cfg if indexing_cfg is not None else GpuOptions() @property def required_headers(self) -> set[str]: diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index 78e721f38..b033e4d58 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -11,7 +11,7 @@ from ..ast.structural import PsBlock, PsLoop, PsPragma from ..ast.expressions import PsExpression if TYPE_CHECKING: - from ...codegen.config import OpenMpConfig + from ...codegen.config import OpenMpOptions __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"] @@ -105,7 +105,7 @@ 
class AddOpenMP: `OpenMpConfig` configuration. """ - def __init__(self, ctx: KernelCreationContext, omp_params: OpenMpConfig) -> None: + def __init__(self, ctx: KernelCreationContext, omp_params: OpenMpOptions) -> None: pragma_text = "omp" pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" pragma_text += f" for schedule({omp_params.schedule})" diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py index e27b94b9e..da33f9ee2 100644 --- a/src/pystencils/codegen/__init__.py +++ b/src/pystencils/codegen/__init__.py @@ -1,10 +1,10 @@ from .target import Target from .config import ( CreateKernelConfig, - CpuOptimConfig, + CpuOptions, VectorizationConfig, - OpenMpConfig, - GpuIndexingConfig, + OpenMpOptions, + GpuOptions, AUTO, ) from .parameters import Parameter @@ -14,10 +14,10 @@ from .driver import create_kernel, get_driver __all__ = [ "Target", "CreateKernelConfig", - "CpuOptimConfig", + "CpuOptions", "VectorizationConfig", - "OpenMpConfig", - "GpuIndexingConfig", + "OpenMpOptions", + "GpuOptions", "AUTO", "Parameter", "Kernel", diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 83602dbbc..4e17ccc48 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -77,7 +77,7 @@ class Option(Generic[Option_T, Arg_T]): return self._default else: return val - + def is_set(self, obj) -> bool: return getattr(obj, self._lookup, None) is not None @@ -102,8 +102,7 @@ class Option(Generic[Option_T, Arg_T]): delattr(obj, self._lookup) -class SimpleOption(Option[Option_T, Option_T]): - ... +class BasicOption(Option[Option_T, Option_T]): ... 
class ConfigBase(ABC): @@ -111,7 +110,7 @@ class ConfigBase(ABC): """Get the value set for the specified option, or the option's default value if none has been set.""" descr: Option = type(self).__dict__[name] return descr.get(self) - + def is_option_set(self, name: str) -> bool: descr: Option = type(self).__dict__[name] return descr.is_set(self) @@ -132,7 +131,7 @@ Category_T = TypeVar("Category_T", bound=ConfigBase) class Category(Generic[Category_T]): """Descriptor for a category of options. - + This descriptor makes sure that when an entire category is set to an object, that object is copied immediately such that later changes to the original do not affect this configuration. @@ -168,22 +167,25 @@ Currently, these options permit `AUTO`: @dataclass -class OpenMpConfig: +class OpenMpOptions(ConfigBase): """Parameters controlling kernel parallelization using OpenMP.""" - nesting_depth: int = 0 + enable: BasicOption[bool] = BasicOption(False) + """Enable OpenMP instrumentation""" + + nesting_depth: BasicOption[int] = BasicOption(0) """Nesting depth of the loop that should be parallelized. Must be a nonnegative number.""" - collapse: int = 0 + collapse: BasicOption[int] = BasicOption() """Argument to the OpenMP ``collapse`` clause""" - schedule: str = "static" + schedule: BasicOption[str] = BasicOption("static") """Argument to the OpenMP ``schedule`` clause""" - num_threads: int | None = None + num_threads: BasicOption[int] = BasicOption() """Set the number of OpenMP threads to execute the parallel region.""" - omit_parallel_construct: bool = False + omit_parallel_construct: BasicOption[bool] = BasicOption(False) """If set to ``True``, the OpenMP ``parallel`` construct is omitted, producing just a ``#pragma omp for``. Use this option only if you intend to wrap the kernel into an external ``#pragma omp parallel`` region. @@ -197,62 +199,17 @@ class OpenMpConfig: @dataclass -class CpuOptimConfig: - """Configuration for the CPU optimizer. 
- - If any flag in this configuration is set to a value not supported by the CPU specified - in `CreateKernelConfig.target`, an error will be raised. - """ - - openmp: bool | OpenMpConfig = False - """Enable OpenMP parallelization. - - If set to `True`, the kernel will be parallelized using OpenMP according to the default settings in `OpenMpConfig`. - To customize OpenMP parallelization, pass an instance of `OpenMpConfig` instead. - """ - - vectorize: bool | VectorizationConfig = False - """Enable and configure auto-vectorization. - - If set to an instance of `VectorizationConfig` and a CPU target with vector capabilities is selected, - pystencils will attempt to vectorize the kernel according to the given vectorization options. - - If set to `True`, pystencils will infer vectorization options from the given CPU target. - - If set to `False`, no vectorization takes place. - """ - - loop_blocking: None | tuple[int, ...] = None - """Block sizes for loop blocking. - - If set, the kernel's loops will be tiled according to the given block sizes. - """ - - use_cacheline_zeroing: bool = False - """Enable cache-line zeroing. - - If set to `True` and the selected CPU supports cacheline zeroing, the CPU optimizer will attempt - to produce cacheline zeroing instructions where possible. - """ - - def get_vectorization_config(self) -> VectorizationConfig | None: - if self.vectorize is True: - return VectorizationConfig() - elif isinstance(self.vectorize, VectorizationConfig): - return self.vectorize - else: - return None - - -@dataclass -class VectorizationConfig: +class VectorizationConfig(ConfigBase): """Configuration for the auto-vectorizer. If any flag in this configuration is set to a value not supported by the CPU specified in `CreateKernelConfig.target`, an error will be raised. 
""" - lanes: int | None = None + enable: BasicOption[bool] = BasicOption(False) + """Enable intrinsic vectorization.""" + + lanes: BasicOption[int] = BasicOption() """Number of SIMD lanes to be used in vectorization. If set to `None` (the default), the vector register width will be automatically set to the broadest possible. @@ -261,7 +218,9 @@ class VectorizationConfig: operation contained in the kernel with the given number of lanes, an error will be raised. """ - use_nontemporal_stores: bool | Collection[str | Field] = False + use_nontemporal_stores: BasicOption[bool | Collection[str | Field]] = BasicOption( + False + ) """Enable nontemporal (streaming) stores. If set to `True` and the selected CPU supports streaming stores, the vectorizer will generate @@ -271,14 +230,14 @@ class VectorizationConfig: the given fields. """ - assume_aligned: bool = False + assume_aligned: BasicOption[bool] = BasicOption(False) """Assume field pointer alignment. If set to `True`, the vectorizer will assume that the address of the first inner entry (after ghost layers) of each field is aligned at the necessary byte boundary. """ - assume_inner_stride_one: bool = False + assume_inner_stride_one: BasicOption[bool] = BasicOption(False) """Assume stride associated with the innermost spatial coordinate of all fields is one. If set to `True`, the vectorizer will replace the stride of the innermost spatial coordinate @@ -307,10 +266,48 @@ class VectorizationConfig: @dataclass -class GpuIndexingConfig: +class CpuOptions(ConfigBase): + """Configuration for the CPU optimizer. + + If any flag in this configuration is set to a value not supported by the CPU specified + in `CreateKernelConfig.target`, an error will be raised. + """ + + openmp: Category[OpenMpOptions] = Category(OpenMpOptions()) + """Options governing OpenMP-instrumentation. + """ + + vectorize: Category[OpenMpOptions] = Category(OpenMpOptions()) + """Options governing intrinsic vectorization. 
+ """ + + loop_blocking: BasicOption[tuple[int, ...]] = BasicOption() + """Block sizes for loop blocking. + + If set, the kernel's loops will be tiled according to the given block sizes. + """ + + use_cacheline_zeroing: BasicOption[bool] = BasicOption(False) + """Enable cache-line zeroing. + + If set to `True` and the selected CPU supports cacheline zeroing, the CPU optimizer will attempt + to produce cacheline zeroing instructions where possible. + """ + + def get_vectorization_config(self) -> VectorizationConfig | None: + if self.vectorize is True: + return VectorizationConfig() + elif isinstance(self.vectorize, VectorizationConfig): + return self.vectorize + else: + return None + + +@dataclass +class GpuOptions(ConfigBase): """Configure index translation behaviour for kernels generated for GPU targets.""" - omit_range_check: bool = False + omit_range_check: BasicOption[bool] = BasicOption(False) """If set to `True`, omit the iteration counter range check. By default, the code generator introduces a check if the iteration counters computed from GPU block and thread @@ -318,10 +315,10 @@ class GpuIndexingConfig: This check can be discarded through this option, at your own peril. """ - block_size: tuple[int, int, int] | None = None + block_size: BasicOption[tuple[int, int, int]] = BasicOption() """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system.""" - manual_launch_grid: bool = False + manual_launch_grid: BasicOption[bool] = BasicOption(False) """Always require a manually specified launch grid when running this kernel. If set to `True`, the code generator will not attempt to infer the size of @@ -329,7 +326,7 @@ class GpuIndexingConfig: The launch grid will then have to be specified manually at runtime. """ - sycl_automatic_block_size: bool = True + sycl_automatic_block_size: BasicOption[bool] = BasicOption(True) """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size. 
If set to `True`, the kernel is generated for execution via @@ -343,24 +340,30 @@ class GpuIndexingConfig: """ +GhostLayerSpec = _AUTO_TYPE | int | Sequence[int | tuple[int, int]] + + +IterationSliceSpec = int | slice | tuple[int | slice] + + @dataclass -class CreateKernelConfig: +class CreateKernelConfig(ConfigBase): """Options for create_kernel.""" - target: Target = Target.GenericCPU + target: BasicOption[Target] = BasicOption(Target.GenericCPU) """The code generation target.""" - jit: JitBase | None = None + jit: BasicOption[JitBase] = BasicOption() """Just-in-time compiler used to compile and load the kernel for invocation from the current Python environment. If left at `None`, a default just-in-time compiler will be inferred from the `target` parameter. To explicitly disable JIT compilation, pass `pystencils.no_jit <pystencils.jit.no_jit>`. """ - function_name: str = "kernel" + function_name: BasicOption[str] = BasicOption("kernel") """Name of the generated function""" - ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None + ghost_layers: BasicOption[GhostLayerSpec] = BasicOption() """Specifies the number of ghost layers of the iteration region. Options: @@ -376,7 +379,7 @@ class CreateKernelConfig: At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set. """ - iteration_slice: None | int | slice | tuple[int | slice] = None + iteration_slice: BasicOption[IterationSliceSpec] = BasicOption() """Specifies the kernel's iteration slice. Example: @@ -390,7 +393,7 @@ class CreateKernelConfig: At most one of `ghost_layers`, `iteration_slice`, and `index_field` may be set. """ - index_field: Field | None = None + index_field: BasicOption[Field] = BasicOption() """Index field for a sparse kernel. If this option is set, a sparse kernel with the given field as index field will be generated. 
@@ -401,10 +404,10 @@ class CreateKernelConfig: """Data Types""" - index_dtype: UserTypeSpec = DEFAULTS.index_dtype + index_dtype: Option[PsIntegerType, UserTypeSpec] = Option(DEFAULTS.index_dtype) """Data type used for all index calculations.""" - default_dtype: UserTypeSpec = PsIeeeFloatType(64) + default_dtype: Option[PsScalarType, UserTypeSpec] = Option(DEFAULTS.numeric_dtype) """Default numeric data type. This data type will be applied to all untyped symbols. @@ -412,14 +415,14 @@ class CreateKernelConfig: """Analysis""" - allow_double_writes: bool = False + allow_double_writes: BasicOption[bool] = BasicOption(False) """ If True, don't check if every field is only written at a single location. This is required for example for kernels that are compiled with loop step sizes > 1, that handle multiple cells at once. Use with care! """ - skip_independence_check: bool = False + skip_independence_check: BasicOption[bool] = BasicOption(False) """ By default the assignment list is checked for read/write independence. This means fields are only written at locations where they are read. Doing so guarantees thread safety. In some cases e.g. for @@ -428,17 +431,33 @@ class CreateKernelConfig: """Target-Specific Options""" - cpu_optim: None | CpuOptimConfig = None - """Configuration of the CPU kernel optimizer. - - If this parameter is set while `target` is a non-CPU target, an error will be raised. - """ - - gpu_indexing: None | GpuIndexingConfig = None - """Configure index translation for GPU kernels. - - It this parameter is set while `target` is not a GPU target, an error will be raised. 
- """ + cpu_optim: Category[CpuOptions] = Category(CpuOptions()) + """Options for CPU kernels.""" + + gpu_indexing: Category[GpuOptions] = Category(GpuOptions()) + """Options for GPU Kernels.""" + + @index_dtype.validate + def validate_index_type(self, spec: UserTypeSpec): + dtype = create_type(spec) + if not isinstance(dtype, PsIntegerType): + raise ValueError("index_dtype must be an integer type") + return dtype + + @default_dtype.validate + def validate_default_dtype(self, spec: UserTypeSpec): + dtype = create_type(spec) + if not isinstance(dtype, PsScalarType): + raise ValueError("default_dtype must be a scalar numeric type") + return dtype + + @index_field.validate + def validate_index_field(self, idx_field: Field): + if idx_field.field_type != FieldType.INDEXED: + raise ValueError( + "Only fields of type FieldType.INDEXED can be used as index fields" + ) + return idx_field # Deprecated Options @@ -457,20 +476,23 @@ class CreateKernelConfig: # Getters def get_target(self) -> Target: - match self.target: + t: Target = self.get_option("target") + match t: case Target.CurrentCPU: return Target.auto_cpu() case _: - return self.target + return t def get_jit(self) -> JitBase: """Returns either the user-specified JIT compiler, or infers one from the target if none is given.""" - if self.jit is None: - if self.target.is_cpu(): + jit: JitBase | None = self.get_option("jit") + + if jit is None: + if self.get_target().is_cpu(): from ..jit import LegacyCpuJit return LegacyCpuJit() - elif self.target == Target.CUDA: + elif self.get_target() == Target.CUDA: try: from ..jit.gpu_cupy import CupyJit @@ -487,7 +509,7 @@ class CreateKernelConfig: return no_jit - elif self.target == Target.SYCL: + elif self.get_target() == Target.SYCL: from ..jit import no_jit return no_jit @@ -496,64 +518,14 @@ class CreateKernelConfig: f"No default JIT compiler implemented yet for target {self.target}" ) else: - return self.jit + return jit # Postprocessing def __post_init__(self, *args): - # 
Check deprecated options self._check_deprecations(*args) - # Check index data type - if not isinstance(create_type(self.index_dtype), PsIntegerType): - raise PsOptionsError("`index_dtype` was not an integer type.") - - # Check iteration space argument consistency - if ( - int(self.iteration_slice is not None) - + int(self.ghost_layers is not None) - + int(self.index_field is not None) - > 1 - ): - raise PsOptionsError( - "Parameters `iteration_slice`, `ghost_layers` and 'index_field` are mutually exclusive; " - "at most one of them may be set." - ) - - # Check index field - if ( - self.index_field is not None - and self.index_field.field_type != FieldType.INDEXED - ): - raise PsOptionsError( - "Only fields with `field_type == FieldType.INDEXED` can be specified as `index_field`" - ) - - # Check optim - if self.cpu_optim is not None: - if ( - self.cpu_optim.vectorize is not False - and not self.target.is_vector_cpu() - ): - raise PsOptionsError( - f"Cannot enable auto-vectorization for non-vector CPU target {self.target}" - ) - - if self.gpu_indexing is not None: - if isinstance(self.gpu_indexing, str): - match self.gpu_indexing: - case "block": - self.gpu_indexing = GpuIndexingConfig() - case "line": - raise NotImplementedError( - "GPU line indexing is currently unavailable." 
- ) - case other: - raise PsOptionsError( - f"Invalid value for option gpu_indexing: {other}" - ) - def _check_deprecations( self, data_type: UserTypeSpec | None, @@ -561,7 +533,7 @@ class CreateKernelConfig: cpu_vectorize_info: dict | None, gpu_indexing_params: dict | None, ): # pragma: no cover - optim: CpuOptimConfig | None = None + optim: CpuOptions | None = None if data_type is not None: _deprecated_option("data_type", "default_dtype") @@ -575,20 +547,20 @@ class CreateKernelConfig: if cpu_openmp is not None: _deprecated_option("cpu_openmp", "cpu_optim.openmp") - deprecated_omp: OpenMpConfig | bool + deprecated_omp: OpenMpOptions | bool match cpu_openmp: case True: - deprecated_omp = OpenMpConfig() + deprecated_omp = OpenMpOptions() case False: deprecated_omp = False case int(): - deprecated_omp = OpenMpConfig(num_threads=cpu_openmp) + deprecated_omp = OpenMpOptions(num_threads=cpu_openmp) case _: raise PsOptionsError( f"Invalid option for `cpu_openmp`: {cpu_openmp}" ) - optim = CpuOptimConfig(openmp=deprecated_omp) + optim = CpuOptions(openmp=deprecated_omp) if cpu_vectorize_info is not None: _deprecated_option("cpu_vectorize_info", "cpu_optim.vectorize") @@ -637,7 +609,7 @@ class CreateKernelConfig: if optim is not None: optim = replace(optim, vectorize=deprecated_vec_opts) else: - optim = CpuOptimConfig(vectorize=deprecated_vec_opts) + optim = CpuOptions(vectorize=deprecated_vec_opts) if optim is not None: if self.cpu_optim is not None: @@ -655,7 +627,7 @@ class CreateKernelConfig: "Cannot specify both `gpu_indexing` and the deprecated `gpu_indexing_params` at the same time." 
) - self.gpu_indexing = GpuIndexingConfig( + self.gpu_indexing = GpuOptions( block_size=gpu_indexing_params.get("block_size", None) ) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 28b685b55..47bcb905d 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -3,12 +3,13 @@ from typing import cast, Sequence, Iterable, TYPE_CHECKING from dataclasses import dataclass, replace from .target import Target -from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO +from .config import CreateKernelConfig, OpenMpOptions, VectorizationConfig, AUTO, GhostLayerSpec, IterationSliceSpec from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter -from ..types import create_numeric_type, PsIntegerType, PsScalarType +from ..field import Field +from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode @@ -105,15 +106,36 @@ class DefaultKernelCreationDriver: def __init__(self, cfg: CreateKernelConfig, retain_intermediates: bool = False): self._cfg = cfg - idx_dtype = create_numeric_type(self._cfg.index_dtype) - assert isinstance(idx_dtype, PsIntegerType) + # Data Type Options + idx_dtype: PsIntegerType = cfg.get_option("index_dtype") + default_dtype: PsScalarType = cfg.get_option("default_dtype") + + # Iteration Space Options + num_ispace_options_set = ( + int(cfg.is_option_set("ghost_layers")) + + int(cfg.is_option_set("iteration_slice")) + + int(cfg.is_option_set("index_field")) + ) + + if num_ispace_options_set > 1: + raise ValueError( + "At most one of the options 'ghost_layers' 'iteration_slice' and 'index_field' may be set." 
+ ) + + self._ghost_layers: GhostLayerSpec | None = cfg.get_option("ghost_layers") + self._iteration_slice: IterationSliceSpec | None = cfg.get_option("iteration_slice") + self._index_field: Field | None = cfg.get_option("index_field") + + if num_ispace_options_set == 0: + self._ghost_layers = AUTO + # Create the context self._ctx = KernelCreationContext( - default_dtype=create_numeric_type(self._cfg.default_dtype), + default_dtype=default_dtype, index_dtype=idx_dtype, ) - self._target = self._cfg.get_target() + self._target = cfg.get_target() self._platform = self._get_platform() self._intermediates: CodegenIntermediates | None @@ -153,7 +175,7 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Target-Specific optimizations - if self._cfg.target.is_cpu(): + if self._target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) # Note: After this point, the AST may contain intrinsics, so type-dependent @@ -174,13 +196,13 @@ class DefaultKernelCreationDriver: canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) - if self._cfg.target.is_cpu(): + if self._target.is_cpu(): return create_cpu_kernel_function( self._ctx, self._platform, kernel_ast, - self._cfg.function_name, - self._cfg.target, + self._cfg.get_option("function_name"), + self._target, self._cfg.get_jit(), ) else: @@ -189,8 +211,8 @@ class DefaultKernelCreationDriver: self._platform, kernel_ast, gpu_threads, - self._cfg.function_name, - self._cfg.target, + self._cfg.get_option("function_name"), + self._target, self._cfg.get_jit(), ) @@ -272,8 +294,8 @@ class DefaultKernelCreationDriver: params = ( cpu_cfg.openmp - if isinstance(cpu_cfg.openmp, OpenMpConfig) - else OpenMpConfig() + if isinstance(cpu_cfg.openmp, OpenMpOptions) + else OpenMpOptions() ) add_omp = AddOpenMP(self._ctx, params) kernel_ast = cast(PsBlock, add_omp(kernel_ast)) diff --git a/tests/codegen/test_config.py 
b/tests/codegen/test_config.py index 613c279c3..715830e70 100644 --- a/tests/codegen/test_config.py +++ b/tests/codegen/test_config.py @@ -1,16 +1,20 @@ +import pytest + from dataclasses import dataclass -from pystencils.codegen.config import SimpleOption, Option, Category, ConfigBase +import numpy as np +from pystencils.codegen.config import BasicOption, Option, Category, ConfigBase, CreateKernelConfig +from pystencils.types.quick import Int, UInt def test_descriptors(): @dataclass class SampleCategory(ConfigBase): - val1: SimpleOption[int] = SimpleOption(2) + val1: BasicOption[int] = BasicOption(2) val2: Option[bool, str] = Option(False) @val2.validate - def _val2(self, v: str): + def validate_val2(self, v: str): if v.lower() in ("off", "false", "no"): return False elif v.lower() in ("on", "true", "yes"): @@ -21,7 +25,7 @@ def test_descriptors(): @dataclass class SampleConfig(ConfigBase): cat: Category[SampleCategory] = Category(SampleCategory()) - val: SimpleOption[str] = SimpleOption("fallback") + val: BasicOption[str] = BasicOption("fallback") cfg = SampleConfig() @@ -55,3 +59,16 @@ def test_descriptors(): assert cfg.cat is not c c.val1 = 13 assert cfg.cat.val1 == 32 + + +def test_config_validation(): + cfg = CreateKernelConfig(index_dtype="int32") + assert cfg.index_dtype == Int(32) + cfg.index_dtype = np.uint64 + assert cfg.index_dtype == UInt(64) + + with pytest.raises(ValueError): + _ = CreateKernelConfig(index_dtype=np.float32) + + with pytest.raises(ValueError): + cfg.index_dtype = "double" diff --git a/tests/fixtures.py b/tests/fixtures.py index 7c9521614..8c7f12015 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -49,7 +49,7 @@ def gen_config(target: ps.Target): if target.is_vector_cpu(): gen_config = replace( gen_config, - cpu_optim=ps.CpuOptimConfig( + cpu_optim=ps.CpuOptions( vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) ), ) diff --git a/tests/kernelcreation/test_iteration_slices.py 
b/tests/kernelcreation/test_iteration_slices.py index fee3544f8..5c7b4d8cb 100644 --- a/tests/kernelcreation/test_iteration_slices.py +++ b/tests/kernelcreation/test_iteration_slices.py @@ -13,7 +13,7 @@ from pystencils import ( make_slice, Target, CreateKernelConfig, - GpuIndexingConfig, + GpuOptions, DynamicType, ) from pystencils.sympyextensions.integer_functions import int_rem @@ -141,7 +141,7 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): if gen_config.target == Target.CUDA: gen_config = replace( - gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True) + gen_config, gpu_indexing=GpuOptions(manual_launch_grid=True) ) kernel = create_kernel(update, gen_config).compile() @@ -174,7 +174,7 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp): if gen_config.target == Target.CUDA: gen_config = replace( - gen_config, gpu_indexing=GpuIndexingConfig(manual_launch_grid=True) + gen_config, gpu_indexing=GpuOptions(manual_launch_grid=True) ) try: diff --git a/tests/nbackend/kernelcreation/test_openmp.py b/tests/nbackend/kernelcreation/test_openmp.py index d7be8eb98..ae775ca20 100644 --- a/tests/nbackend/kernelcreation/test_openmp.py +++ b/tests/nbackend/kernelcreation/test_openmp.py @@ -4,8 +4,8 @@ from pystencils import ( Assignment, create_kernel, CreateKernelConfig, - CpuOptimConfig, - OpenMpConfig, + CpuOptions, + OpenMpOptions, Target, ) @@ -21,14 +21,14 @@ def test_openmp(nesting_depth, schedule, collapse, omit_parallel_construct): f, g = fields("f, g: [3D]") asm = Assignment(f.center(0), g.center(0)) - omp = OpenMpConfig( + omp = OpenMpOptions( nesting_depth=nesting_depth, schedule=schedule, collapse=collapse, omit_parallel_construct=omit_parallel_construct, ) gen_config = CreateKernelConfig( - target=Target.CPU, cpu_optim=CpuOptimConfig(openmp=omp) + target=Target.CPU, cpu_optim=CpuOptions(openmp=omp) ) kernel = create_kernel(asm, gen_config) diff --git a/tests/test_quicktests.py b/tests/test_quicktests.py index 
5d5dba0ea..3e7f4f071 100644 --- a/tests/test_quicktests.py +++ b/tests/test_quicktests.py @@ -74,7 +74,7 @@ def test_basic_vectorization(): ast = ps.create_kernel( update_rule, target=target, - cpu_optim=ps.CpuOptimConfig( + cpu_optim=ps.CpuOptions( vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) ), ) -- GitLab From d8610f6bf4a25fac4dd04aaad1f3cad4b0a484d6 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 20 Jan 2025 14:09:06 +0100 Subject: [PATCH 3/7] Fix deepcopy of categories. Remove mentions of `config` from the backend. Adapt test suite. --- .../01_tutorial_getting_started.ipynb | 298 +++++++++--------- src/pystencils/__init__.py | 4 +- .../backend/kernelcreation/iteration_space.py | 64 +++- src/pystencils/backend/platforms/cuda.py | 16 +- src/pystencils/backend/platforms/sycl.py | 24 +- .../backend/transformations/add_pragmas.py | 37 ++- src/pystencils/codegen/__init__.py | 4 +- src/pystencils/codegen/config.py | 160 ++++++---- src/pystencils/codegen/driver.py | 107 +++++-- src/pystencils/types/types.py | 2 +- tests/codegen/test_config.py | 88 +++++- tests/fixtures.py | 17 +- tests/kernelcreation/test_buffer_gpu.py | 4 +- tests/kernelcreation/test_gpu.py | 6 +- tests/kernelcreation/test_iteration_slices.py | 13 +- tests/nbackend/kernelcreation/test_openmp.py | 7 +- tests/nbackend/kernelcreation/test_options.py | 28 -- .../transformations/test_add_pragmas.py | 1 + tests/test_quicktests.py | 4 +- 19 files changed, 524 insertions(+), 360 deletions(-) delete mode 100644 tests/nbackend/kernelcreation/test_options.py diff --git a/docs/source/tutorials/01_tutorial_getting_started.ipynb b/docs/source/tutorials/01_tutorial_getting_started.ipynb index 5ce765fce..04dc50e51 100644 --- a/docs/source/tutorials/01_tutorial_getting_started.ipynb +++ b/docs/source/tutorials/01_tutorial_getting_started.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 37, + "execution_count": 1, "metadata": {}, 
"outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -63,14 +63,14 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "4.74 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "3.91 ms ± 88.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -88,22 +88,19 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdoAAAAnCAYAAACon4ncAAAACXBIWXMAAA7EAAAOxAGVKw4bAAANI0lEQVR4Ae2d7bXctBaGJ2elgAAdcDvILYHQAZAKEjoIi3/5x+J2AFQQoAO4FYSbDqADwung3PfRWI4sW7bk8Yc8s7WWR19bW9K7JW9vSWM/enh4OJmrE4HXr19/r5b93bTuX/K/Udo9cflP5T3X9VThz3V9qvA3usj/U/Ef5Xec0kJ+0P2htHcdIos4BCKsLsI+4mW4j4yxCKuLcKeaiJ9hP4L9nKwI34vkFfG6KlndzQHXyqyPgAbdb6rljfz/cCn8iy4UpXfPlY5iPcn/Qd7X8r9uMkM68j/V9afy3sr3/P6teIeuKXvznjBaBHvDvWwoLYU7tRr2ZdjPoV5KXrcgK1O0c0bYymUYeKrimfzQ2kSp/o+qm3wUJ+5jXdA6pavwE12xNYvi+FU0v8r37isFSO840XzR8O+kLxUR71dL8VqDT9P3pbA33DOFtDDu1FoN9pkQdMhsnpx2uT91hJAZyZHV40xeRrYDAhIgNwss2d8V/jxoAsr05ybOErK3ZE+ia8PkK461i+L+jrh3Sv/Ih72vtJcK/yX/ryCNuhj0WMxYwZNOdN5SZtn7E8X9Q8BJYSzqH3R12jnJdGMCte8i7OmjmnwJ7kkMU1CozqSslHcTuIPNHtinZDKWbvK6XFZj+KbyxnAfKZOcj+I3ObfuUowtfT8EJDgUHcvFz3Rxw2bP1Qv6pPA7Xfe6yMd5pXuOdX9Rkijq+25yN6Z8btLs9f7ucxRGiaN8yeOadCoD/d/y2U+mD7/JR2mF7heltf0JM/YOq11LYX8J7jkYdqBSu3NkdQu4g8um2HcEkRkxebVAzZZVy6EgkIl7h6PK5MzH0bllirYDaT0RCRdF9UgtwpJF+b1SPFZ2XyodC/Refs819JSJFV2PVgkoPpR661QehY6ybC3cNjMdgE+orAmzFIt155zCPi3ujyfZ1Vf7LsJe5enXbNxVdhLDGCDVOSmra8cdTPbAPpZFTtzktYiscqDu0OTg3ilwjkzOR/EdvaeZoh1Adc8kCex7XX7/9YQAdYXLxmHzsGjDfdcwj7L3TYL343xvEZOOMmwVZIcwM6LyKFMUTKyYqR+LK3RuYIYJe4fV/kWwvwT3QgznQHa1uANG5dibvAIELpFVwGbV
YOF8TM4tU7SrimkWc79c3BaWsL9Q5MdgYJ6aAYBim7JWORiF5dtxKs9T2nsSG16DyrhTaDqCkh1y1PNxlPFW8dQDRES6WXRJ7OfiXoLhHGCuHXcwqRV7k1cfgbmy6nNaJ6VkPibn1uN12mZcL0DghcpiXfq9UVixHxsfHkLJsmw8aoVSTheWGorVW8oMnu+Udi8fBy+ndF1s+R+UbDxgqZt6a3KLYb8C7kMYzsHuqnEHkIqxN3lFCKwgq6iG1aJD8zE5t0zRriaHeYw18PhLD9eoEx0Klj+ITzrRtqd+E8QoQQbJpS7FA/7xcvKain1WP5bGfibuJRjO6efV4w4oW2Kvup6qSh5kpxwPvfE8mCpz9fKaKavTyrh7uZTMx6SsRhVt05GfVCOWx8+Kx1aVb4z5x0aAyY8yvMhpfPiDWYyXdxGzOA5N6U0nYnn4aA/3QgznAGC4n1FbDHvJjLG91jaIyet8n+jdn1bG3Y2SwvmYlNXd2EylI7r47ySdnNoLHGSl8k908fcUGmGuQgQkG24UxfJpZBs/yfN/XfY6nRMNYV6WEStV6vNL2WfiG/sdwX0UwwTuuejdPO4AtRP2uTIK6W5eXiOyCnFaJJyYW6PzMag4Kau7gGgwqIr9TXN0L3Cw8DmR8jTg/QiNZe2PANYocmodcV0o0m91ubDir1qCs1xfKo3DWs4pzN+BeEkF+8LQchCLvc/YYQEkT0zHxFNx1eX3tadIa8sfwn0KQ+QU4z4lK99vw90jcT7jEI/5Yuw/sMsPabyavPLhgrI3T8qKn6kzcB+aW1NjwjclObcee4oRn8J+SXCELJl1afkk4y0zJCAOJ13zS/hRqGwNtPu56jNWaBtXuOOUjyX8kfxW0UKgeLJMk++WgUQXW7lkz3Xw7C0vzWW2Ybke7tQ9hqHyerg3WBruZYJbBPuyKs/UJq9i1AZlVcplCvehuUUdSr9obt1lNBSLdK41C3vKL2a5wHAnhxJaUjHs1I3hajWQkDHL/J0n/GHqD6minzM+/KT5wOhGQ4b7foLfGPs5HbV50qA2V1ZzQF/jntZRtKqA5Qz3Kin5vImI03Rcg/uzynfLgw0t76/1lgrLePChHDduPuVGPFx2VPIxnNqNMrnEqj9ER9VPHiY4GVliGTJm7nM7KFpWBhgLV/vQkouFpxMWhrsHY2N/C+zndMnmSR+1mbLqM5pOWfye1ipadQJlwtdheP2cf08tT1S4nkUrGpToG/m8UJk1bF5+7+gV521G7M25V/opzDt0v9QF3aGc2ozSoV8vDtXwmY1Vf0eXSGK2oucP5yWO0+u98VTC4BppDff9pLoB9nM6Z/NkALVSWQ2wmExSHYvf05yiFWOUCYoSBRtaGoTdC+zD1okGKxWrlb0i77hBu8+4+QT57M+GNEHWOSge1X6WTW3Dmv+vrhcKZ1ttvU4eLGHNvq7J+2Aw95q7JjZr8u515IAJteFTW3tqEmlt2OS059HDw8NJhFieL+XzEvvWKc7fL/hrRsfKURxFSx6WCQoaCzZU0EpyG8iD5V3mOZ9lRJZkWwtHYW9F9z6x5svF/lQZ5Rd/lq3hyVI3CvZ9XGdBnIeXa9ijLujyeqTIUtxZfYndx03CkKx4WOy9hjJmYPE0AoZ7Gpsac0xedUnFK9p/1CxO1GKBOqcwVi7pnU+nnXOdEkUhhnuuLCG3CjmjPPx/El17A1TYvXZQvltils8NFUXVtsvX7/2cMg0f+tG2z5cf8xve7J99pvDqFq3qeBhrzy3kCYPOw15un1WOk8/srczanjDs3Zwuxt5wzx2hy9IJ92JZ0QKT17JyyOX2WMCj8LjiA09fwUT5rbVJ3Dulo7RQgk4ZKszhqfD9uc7qCMsrzKlWr7BQ1G4P1/OUT9pnPk5ZXXzPlBtoz2Ju6CbLNHw4uBXW76tJ+qLnRf5/iIDl46yPnieZZWSorlmTJ4O1kUwgYNhPALRStuG+ErArsTV5zQPW7dE2RWNF1u6vCtx2
H1VhFBZLws4pnPqMW1seQtE5i+Ncyv0+o6yPK8xyNAo/bgeKmb3SnissQ11DS449vmGC6mCPmaVnFLo5Q8AQMAQMAUOgCIE7KRAUGUoIReec0lCKKDesORzLrl4Boqw6lmhD3/mMm2jYM3NllI8CxSp1B6PkUxf1hg6aIceem99/i/NLyiQ/YRQzjeNqL6fQOi9liGksbggYAoaAIWAIDCHwuElkn5T9UvZcP9H1RhcWqfufrPxQsb5QHGvU7acqjLtXnL3M0LG0jCUI3Ul+uHeGoh06tAJp7FCyKYUa0/r4UBkUe/sw4QkLfA6FdazwgrJGaggYAoaAIXCjCDhFKwWCEmoPJQVY9NJEi1XqLNOArhcUHdYsynrIoTipM3Rx3OdB661pn+b9kjK5it3z7vjqT9FBqk5hixgChoAhYAjcLAJ3O/UcxYkCbV2jmFGcQ1bnoGIvLAPflMJu23HrAWHKW7zCFYyjQMLYST14HaIPB8X+8LiPDY6DymSsS1crr5pl5ZeOxwSzeJ4A4X+NQwr1O1XGHrBTrKIhzJKtU5Dynyj+rfzQuhwtI3rvqK89xOUTze8hwP+iBx9sepQVJWhMtAfrKmpWaVMOh/2V4D4mp8PJZKwzVy6vamV1NyaUlfN6nz3SIGAfd+wTayjLl6JrDyZllPHdYBnbXhzh0RjwhWX4v+gBCktaCwHDfi1k5/M1mczHbuuStctqF4u2EQJ/l+EAVWidngRYJx4KTHlYWrV8li1s2uHDwvapOnG1y0o1C8iwr086JpP6ZJJq0RFktZtFK3BY6rPPsqVGz/bpzyWT0pdpb9/K66zRsK9PriaT+mSSalH1stpN0YKYbuxYtPZZttTw2ShdcmDJ+IgHoDZCaL1qDPv1sJ3L2WQyF7ntyx1FVrsqWsQioJJLxUNiE32p1WWfmxoCskkTnux78z9oO5E9gtMaWYb9GqhextNkchl+W5Y+kqx2V7QIRoCxN7iKW5P3Kg3enikrCqUPL9u38jprNOzrk6vJpD6ZpFp0GFlVoWhTKFr6ughIwfLWLlsyXhfmQe6G/SAsuyaaTHaFv6jyo8nKFG2ReK+HWAOVJWMOo9mS8cZiNew3BjyjOpNJBkiVkBxRVu57tJXgZ83YEAENVl4GwmG02PEfZZbyORXOf52L9tBjZhbvI2DY9zHZO8VksrcE8us/oqxM0ebL9yYoNYj/UUf59GHvPdc3AcCOnTTsdwQ/UbXJJAFMhck1y8qWjiscMDs3iddccpnbHgHDfnvMp2o0mUwhVE9+tbIyi7aeQbJrS/Q0yKEo9m1ZUsbxusq3Sg8/b+gy7GdZBAz7ZfFcgpvJZAkUt+FxBFn9HxdoFR2CYF4nAAAAAElFTkSuQmCC", "text/latex": [ "$\\displaystyle {dst}_{(0,0)} \\leftarrow_{} \\frac{{src}_{(1,0)}}{4} + \\frac{{src}_{(0,1)}}{4} + \\frac{{src}_{(0,-1)}}{4} + \\frac{{src}_{(-1,0)}}{4}$" ], "text/plain": [ - " src_E src_N src_S src_W\n", - "dst_C := ───── + ───── + ───── + ─────\n", - " 4 4 4 4 " + "Assignment(dst_C, src_E/4 + src_N/4 + src_S/4 + src_W/4)" ] }, - "execution_count": 41, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -118,12 +115,12 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAARUAAAEnCAYAAACHXNdEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAPWklEQVR4nO3df5DU9X3H8df+ut37sdzBnhxgFWTEQK6RXzpT0xLwRkioYmYcZrSmYslYtZm20kmmQtXO2Ngpf6C1SfAnYvQmyehojVEz1jhtbceJMhYLoRHPE0GQAPcD7o7jdm93v9/O93v8ENjf9wYO9/mY2bld7u6768f9Pu/z/ez39gKu67oCACNBqw0BAFEBYI6ZCgBTYdvN4XzXn3b0u5Sj4RJW2kKSJkaDaq7hZxNOICrwbRvI6J93DunXBzPKljkmX2kI6c8viunrF9QwmlCAV3/w24GMVmw9rIFs5S8EenOVdTPrde1EwlLtmLdCT+1JjiooHkfSD3YNMZogKtUu67r69560ybZ2DjnqHCz34AlfNMxUqlx/xtURb5phxFvkRXUjKlUubdyANCdoVz2igoIO/Xyjdt22SB1tE9X99FpGC0URFRQUTrQosfJuxRcuY6RQEs5TQUENC671Pw6+8ytGCiVhpgLAFFEBYIqoADBFVACYIiooyM1k5KSScrNZKZs9cR3Ig6igoJ72depcMkX9r7Wrt/3BketvPMeoIS9+S7nKHUg5WvBun9n2Hm2tV1uC31SuZsxUAJgiKgBMcUYt8upYOKHo6Fz2Vi8jiJMQFeRFMFAJDn9QkqFtm9SxKKGeZ9cxYiiIqKAo13HUtf4exWbOZbRQFIc/KKrvlWcUmzVfzuAAo4WimKmgoGxfrw6+8JgSK9cwUigJUUFB3Rse0PjldyoUb2SkUBKigrySHVuV3P6+Gq9bwSihZKypIK+hLW9reHendixv9W87h/ulUFjpzz7RpDXrGTnkRFSQV+OyWxVvu0H9/f3q7upS5F9/pIZLvqQJN69i1JAXUUFewVidfxkYGJIaE0orqECsjvUVFERUUFAqmVQqlRq5sfI+RVpaGDEUxEItCjrUd+jk24dOvg2ciqggL8dx1N/Xf9K/ebMWb/YC5ENUqlykwDNgoL9fbo4/Y3rq7OWk7QUCVg8N5ymiUuXGhQOqD+X+XL5DHW/24s1icpkc5SlV7XgGVLlQIKCrJ0QKL9Cewpu9eLOYU02rDerSfIVC1SAq0G0XxfwZS6mHOLlmMd4TadW0WkYTRAXSrIawfvyVBn1tfFheW3It0J7q2IKtl6I58ZB+8OV6Lb2AN7wG76aPUxzOuPrt/h4t/sYf+2fS5hMIBvX8z36qq+ddrkQNE16cwJ/oQE59fX1KHn3peNu2bbrmmmt000036eGHH/b/LRQKqbm5mdHDaTijFjk1Njb6F8++ffv8j01NTWrhjFoUwbwVgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioAiAqAsYuZCgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYCpc7Av2JrPaccRRynGLbiwYkBKRoFrjIYUCAavHCOAM+Wgwq9+lHKVL2L8jwYAmR4OaUR+qLCpb+zP6h84j+s3hbNkPNBEJ6E+nRPWdqbVlfy+AM++
XB4b18M4h7Uo6ZX/v1FhQd02r1bUTa0qPilevb//msAayxeuVS0/a1b/sSirtyr9zAGPHG13D+u72QZWfkxFeiL63fVCRgLTkgprS1lR+ujdVcVA+78d7kkoZbAeAnQ17khUH5Rjn6HZyyRmV/+gdloUjjvRuX8ZkWwBGr2fY0ZaB8pc0cvG20z3slBaVAym72UVXjjsFcG5Y748lR8WmYyPSNAUYM6xXIzKuwXkqh36+UbtuW6SOtonqfnqt0UMDMBZY7N9lRyWcaFFi5d2KL1xW0R0CGLss9u+iJ7+dqmHBtf7HwXd+VfGdAhibLPZvTtMHYIqoADBFVACYIioAzm1U3ExGTiopN5uVstkT1yu0c+dOvf7663JdTucHRmPTpk3avHnzqLZhsX+XHZWe9nXqXDJF/a+1q7f9wZHrbzxX1jbS6bReeuklLVmyRNOnT9fSpUv15ptvlvtQABzV1dWlq666SvPnz9fcuXP15JNPamBgQOdi/y77JeXmlav9S6Wzkg0bNuiJJ57wByEUCh2foaRSqYq2CUDKerMKZ+T09S1btuj222/XXXfdpVtuuUV33HGH5s2bd8b377O2ppLJZk6alaxdu9YPyrGBAGDr2A/qoaEhbdy40Z+9zJkzx5+9DA4O6kwLuDkWM770XwdNNt7d3aX00/+ogVef9WclhSJyxRVXlFxTnF09PT168cUXNWvWLC1YsIDhH4MOHTqk559/Pu/nA0ffibFmxuWKff9napk0yeR+X5wb1+/Hw5Uf/nQsnFD0ay57q/f49d6eXqm/v6RZyXvvvedfMHZ98MEH/gXnH/fYMkMyqVRfX86olLt/51NWVErZ4OddeOGFap4zR1v/+2UFg8GCYXnooYfU1tZW1vZxdnz44Ye68cYbtXz5ct17770M+xjU1dWlxYsX5/18OBxWJpPRl1tb1ZNnllLu/p33vir9xqFtm7T7L5cq8e01Sqz4Xs6vqW9o0Oq7V+ur312pp556So8//rj279+f81BoxowZmj17dqUPB2dBc3Mz/4/GqH379uU85PFmKPX19VqxYoW/YBue3qob3h8w2b9NF2pdx1HX+nsUmzm3pK+/+OKLdf/992vPnj16+eWX/UVb7z/YiwsAW96sxOO9tOz9MPd+kD/yyCMl/0Aod/8+7f4r+aa+V55RbNZ8OYMDZf/HXn/99f7l008/PT578RYCW1paKnkoACTFYjE1NTX554B5sxLvJWXvFZ+zuX9XPFPJ9vXq4AuPKbFyjUbj87MX7/yVK6+8clTbA6pZU1OTOjs7j89KKg2Kxf5d9kyle8MDGr/8ToXijRXf6UkPIBz2F3QBjE4ikRjlFmz277JmKsmOrUpuf1+N162o+A4BjE1W+3dZM5WhLW9reHendixv9W87h/ulUFjpzz7RpDXrR/VAAJxbVvt3WVFpXHar4m03HL994IdrFJk8VRNuXlXOZgCMQVb7d1lRCcbq/Mvx29FaBWvrzdZXAJw7Vvt3xSe/eTjkAb64JlW4pME7vwEwRVQAnPmoREZ+S9pElGwBY0ZN0HDn9vfv07eXc5e/uNauBBcZbgvA6EyOBhU26oq3nUnREqPy9eYakzttjgQ0b9yo1oIBGGoIB/TVJpt98qqmsOLhYGlR+daUqC4Z5QzD++7V02sVPPqOUwDGhlXTajVulNOVeCigv5lWm/NzOcvRXBNU++VxfWty1J9tlMN7rF8bH9ZjrQ1a1hKt7BEDOGNa42E9e3mDlk2sUX2Z7z7ifb33fe2zG/zt5JJ3HnRBNKi/n1Gn+y6t1UDGVXLkjboLCgXkFzBivBgEwNashrDWzQwr69apP+MqXcL+HQmO7N+hIkcfRQ+uvDdTGhcJaFxZDxnA+cALxHjLl3s5TwWANV7vBWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBE
VAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMBW23Ry+KDZt2qQDBw741z/66CP/4+bNm/Xqq6/616PRqBYvXnxOHyPGpoDruu65fhAYG7Kuq//py+h/93brL/7qr+VkswW//u9Wr9biK+foD5oiioUCZ+1xYmwjKvC9sj+lf9oxpJ60K7muPv74Y2WLRGX69OkKRyKqDUp/dmFMqy6pZTTBmgqk/+wZ1t9+eGQkKJ5AQI1NTQWHpq6+3g+KZ8iRHt2d1CO7hhhOEBVI7XtTck4ZiKbGxoJD05QjOj/xtsPRdNXj1Z8ql3Zc/fpg5rR/92Yh3mwkl1AopIYcn+tOu/q/w4UPmfDFR1SqXF/GVbaM2YjHPzQK5F6Y7R1m3b/aEZUq5xRogDcb8WYlKuPQKCuiUu2ICvILBBR955fS92+V7vwj6RcbTlqgBXIhKigo/ntTpWW3SfOuLnhIBBxDVFBQ46Jvqu4PvyHVNSgQDORcoAU+j6igqGOzk2gslneBFjiG3/1BUQ0NDf4rPqG6OkYLRTFTAWCKqAAwRVRQkJvJyEkl5Xq/XJjNnrgO5EFUUFBP+zp1Lpmi/tfa1dv+4Mj1N55j1JAXb31Q5Q6kHC14t89se4+21qstUWO2PZx/mKkAMEVUAJjiPBXk1bFwQtHRueytXkYQJyEqyItgoBIc/qAkQ9s2qWNRQj3PrmPEUBBRQVGu46hr/T2KzZzLaKEoDn9QVN8rzyg2a76cwQFGC0UxU0FB2b5eHXzhMSVWrmGkUBKigoK6Nzyg8cvvVChe+N31gWOICvJKdmxVcvv7arxuBaOEkrGmgryGtryt4d2d2rG81b/tHO6XQmGlP/tEk9asZ+SQE1FBXo3LblW87Ybjtw/8cI0ik6dqws2rGDXkRVSQVzBW51+O347WKlhbz/oKCiIqKBmHPCgFC7UATBEVAKaISpWLGT8DokH+hEe1IypVLh4OqDliF4Lptaf/7WVUF6JS5QKBgJY027z94+x4SJOtpz447/AMgL5zcUyX1I7uqRAPBXTfpfyxMfDG1ziqe9jRT/am9G/dw9o95GjYLT403oHOxGhAbRNq9CdToppRz6EPiAoAYxz+ADBFVACYIioATBEVALL0/wS1Td+LmKNVAAAAAElFTkSuQmCC", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAARUAAAEnCAYAAACHXNdEAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAD4NJREFUeJzt3X+Q03V+x/FXfuwm+yPsQlYWsAIy4sFtT37pTL2WA3eEO6p4Mw4zWq9iubFqb9pK524qVO2MPTvlD7T27vAn4unO3Y2O1vPUG+s5bW3HOWUsFo6euK4IghywP2B3WTbZJN9v55vlh2CSTbJvSGKej5nMJkvyzQfN97mf7yffDT7XdV0BgBG/1YYAwENUAJgiKgBMBW03h0o3kHD0u7ijkTxW2gKSJof8aqnlZxNOIypI2zmY1D/vGdavjySVKvCxX2kM6M8vCuvrF9Seo9Ghkvh49we/HUxq9Y5jGkwV/1Lw5iob5zTomsmEpdoxb4We3B8bV1A8jqQf7B02GxMqF1GpcinX1b/3Jky2tWfYUddQoQdP+KIhKlVuIOnquDfNMOIt8qK6EZUqlzBuQIIluqpHVJDT0Z9v0d5bl6qzfbJ6ntpQ6uGgAhAV5BSMtiq65i5Flqws9VBQIThPBTk1Lr4m/XXo7V+VeiioEMxUAJgiKgBMERUApogKAFNEBTm5yaSceExuKiWlUqevA1kQFeTU27FRXcunaeDVDvV1PDB6/fVnSz0slDF+S7nKHY47WvxOv9n2HmlrUHuU31SuZsxUAJgiKgBMcUYtsupcMmnM+1z6Zt95GQsqB1FBVgQDxeDwB3kZ3rlVnUuj6n1mY6mHgjJHVDAm13HUveluhecsKPVQUAE4/MGY+l9+WuG5i+QMDZZ6KKgAzFSQU6q/T0eef1TRNetLPRRUCKKCnHo236+Jq+5QINJU6qGgQhAVZBXr3KHYrvfUdO3qUg8FFYQ1FWQ1vP0tjezr0u5VbenbzrEBKRBU4tOPNWX9plIPD2WKqCCrppW3KNJ+vQYGBtTT3a2af/2RGi/+kibdtLbUQ0MZIyrIyh+uT18GB4elpqgS8ssXrmd9BTkRFeQUj8UUj8dHb6y5VzWtraUeEsocC7XI6Wj/0TNvHz3zNnA2ooKsHMfRQP/AGd/zZi3e7AXIhqhUuZocr4DBgQFl+gyvs2cvZ2zP57MaGioUUalyE4I+NQQy/1m2Qx1v9uLNYjKZGuIlVe14BVS5gM+nqybV5F6gPYs3e/FmMWebWefXJdkKhapBVKBbLwqnZyz5HuJkmsV4L6S1M+vOyfhQWYgKNLcxqB9/pVFfmxiU15ZMC7RnO7lg66VofiSgH3y5QSsu4AOvwafp4yzHkq5+e6hXy77xx+kzabPx+f167mc/1VULL1O0lp9NOI2oIKP+/n7FTrx1vHPnTl199dW68cYb9dBDD6W/FwgE1NLSUuJRohxxRi0yampqSl88Bw8eTH9tbm5WK2fUYgzMWwGYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmgmPd4UAspd3HHcUdd8yN+X1StMavtkhAAZ/PaowAzpEPh1L6XdxRIo/9u8bv09SQX7MbAsVFZcd
AUv/QdVy/OZYqeKDRGp/+dFpI35lRV/BjAZx7vzw8oof2DGtvzCn4sTPCft05s07XTK7NPypevb79m2MaTI1dr0x6E67+ZW9MCVfpJwdQPl7vHtF3dw2p8JyM8kL0vV1DqvFJyy+ozW9N5acH4kUH5bN+vD+muMF2ANjZvD9WdFBOck5sJ5OMUfmPvhFZOO5I7/QnTbYFYPx6RxxtHyx8SSMTbzs9I05+UTkct5tddGd4UgClYb0/5h0Vm46NStAUoGxYr0YkXYPzVI7+fIv23rpUne2T1fPUBqOhASgHFvt3wVEJRlsVXXOXIktWFvWEAMqXxf495slvZ2tcfE3669Dbvyr6SQGUJ4v9m9P0AZgiKgBMERUApogKgNJGxU0m5cRjclMpKZU6fb1Ie/bs0WuvvSbX5XR+YDy2bt2qbdu2jWsbFvt3wVHp7dioruXTNPBqh/o6Hhi9/vqzBW0jkUjoxRdf1PLlyzVr1iytWLFCb7zxRqFDAXBCd3e3rrzySi1atEgLFizQE088ocHBQZVi/y74LeWWNevSl2JnJZs3b9bjjz+e/o8QCAROzVDi8XhR2wQgpbxZhTN6+vr27dt122236c4779TNN9+s22+/XQsXLjzn+/d5W1NJppJnzEo2bNiQDsrJ/xAAbJ38QT08PKwtW7akZy/z589Pz16GhoZ0rvncDIsZX/qvIyYb7+npVuKpf9TgK8+kZyW5InL55ZfnXVOcX729vXrhhRc0d+5cLV68uNTDQQZHjx7Vc889p2x8Jz6JsXb2ZQp//2dqnTJFFl5YENHvR4LFH/50Lpk05n0ufbPv1PW+3j5pYCCvWcm7776bvqB8vf/+++kLKo97cpkhFlO8vz9jVArdv7MpKCr5bPCzLrzwQrXMn68d//2S/H5/zrA8+OCDam9vL2j7OD8++OAD3XDDDVq1apXuueeeUg8HGXhLCsuWLVM2wWBQyWRSX25rU2+WWUqh+3fW5yr2gcM7t2rfX65Q9NvrFV39vYz3aWhs1Lq71umr312jJ598Uo899pgOHTqU8VBo9uzZmjdvXrHDwXnQ0tLC/6MydfDgwYyHPN4MpaGhQatXr04v2AZnten69wZN9m/ThVrXcdS96W6F5yzI6/7Tp0/Xfffdp/379+ull15KL9p6f2EvLgBsebMSj/fWsvfD3PtB/vDDD+f9A6HQ/ftzz1/Mg/pfflrhuYvkDA0W/Je97rrr0pdPPvnk1OzFWwhsbW0tZigAJIXDYTU3N6fPAfNmJd5byt47Pudz/y56ppLq79OR5x9VdM16jcdnZy/e+StXXHHFuLYHVLPm5mZ1dXWdmpUUGxSL/bvgmUrP5vs1cdUdCkSain7SMwYQDKYXdAGMTzQaHecWbPbvgmYqsc4diu16T03Xri76CQGUJ6v9u6CZyvD2tzSyr0u7V7WlbzvHBqRAUIlPP9aU9ZvGNRAApWW1fxcUlaaVtyjSfv2p24d/uF41U2do0k1rC9kMgDJktX8XFBV/uD59OXU7VCd/XYPZ+gqA0rHav4s++c3DIQ/wxTWlyP2bT34DYIqoADj3UakZ/S1pEyGyBZSNWr/hzp3evz+/vYy7/PQ6uxJcZLgtAOMzNeRX0Kgr3namhPKMytdbak2etKXGp4UTxrUWDMBQY9Cnrzbb7JNXNgcVCfrzi8q3poV08ThnGN6j182qk//EJ04BKA9rZ9ZpwjinK5GAT38zsy7jn2UsR0utXx2XRfStqaH0bKMQ3li/NjGoR9satbI1VNyIAZwzbZGgnrmsUSsn16qhwE8f8e7vPa5jXmN6O5lknQddEPLr72fX695L6jSYdBUb/aDunAI+pQtYY7wYBMDW3MagNs4JKuXWayDpKpHH/l3jH92/A2McfYx5cOV9mNKEGp8mFDRkAJXAC8REy7d7OU8FgDWiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKa
ICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYIqoADBFVACYIioATBEVAKaICgBTRAWAKaICwBRRAWCKqAAwRVQAmCIqAEwRFQCmiAoAU0QFgCmiAsAUUQFgiqgAMEVUAJgiKgBMERUApogKAFNEBYApogLAFFEBYCpouzl8UWzdulWHDx9OX//www/TX7dt26ZXXnklfT0UCmnZsmUlHSPKk891XbfUg0B5SLmu/qc/qf890KO/+Ku/lpNK5bz/361bp2VXzNcfNNcoHPCdt3GivBEVpL18KK5/2j2s3oQrua4++ugjpcaIyqxZsxSsqVGdX/qzC8Nae3HdeRsvyhdrKtB/9o7obz84PhoUj8+npubmnI+pb2hIB8Uz7EiP7Ivp4b3D52O4KHNEBeo4EJdz1veam5pyPqY5Q3R+4m2HiW/VIypVLuG4+vWR5Oe+781CvNlIJoFAQI0Z/qwn4er/juU+ZMIXH1Gpcv1JV6kCZiOe9KGRL/PCbN8IM5VqR1SqnJOjAd5sxJuVqIBDo5SISrUjKsjO51Po7V9K379FuuOPpF9sPmOBFsiEqCCnyO/NkFbeKi28KuchEXASUUFOTUu/qfo//IZU3yif35dxgRb4LKKCMZ2cnYTC4awLtMBJ/O4PxtTY2Jh+xydQX1/qoaACMFMBYIqoADBFVJCTm0zKicfker9cmEqdvg5kQVSQU2/HRnUtn6aBVzvU1/HA6PXXny31sFDG+OiDKnc47mjxO/1m23ukrUHt0Vqz7aHyMFMBYIqoADDFeSrIqnPJpDHvc+mbfedlLKgcRAVZEQwUg8Mf5GV451Z1Lo2q95mNpR4KyhxRwZhcx1H3prsVnrOg1ENBBeDwB2Pqf/lphecukjM0WOqhoAIwU0FOqf4+HXn+UUXXrC/1UFAhiApy6tl8vyauukOBSO5P1wdOIirIKta5Q7Fd76np2tWlHgoqCGsqyGp4+1sa2del3ava0redYwNSIKjEpx9ryvpNpR4eyhRRQVZNK29RpP36U7cP/3C9aqbO0KSb1pZ0XChvRAVZ+cP16cup26E6+esaWF9BTkQFeeOQB/lgoRaAKaICwBRRqXJh41dAyM8/4VHtiEqViwR9aqmxC8Gsus//28uoLkSlyvl8Pi1vsfn4x3mRgKZaT31QcXgFQN+ZHtbFdeN7KUQCPt17Cf/YGPjga5zQM+LoJwfi+reeEe0bdjSSx6vCO9CZHPKpfVKt/mRaSLMbOPQBUQFgjMMfAKaICgBTRAWAKaICQJb+HwS1Td+bGZaOAAAAAElFTkSuQmCC", "text/plain": [ "<Figure size 300x300 with 1 Axes>" ] @@ -147,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -165,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -175,14 +172,14 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ - "548 μs ± 34.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "448 μs ± 72.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -212,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -229,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -238,7 +235,7 @@ "sympy.core.symbol.Symbol" ] }, - "execution_count": 47, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -258,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -272,7 +269,7 @@ "x â‹…(x + y + 5) + x " ] }, - "execution_count": 48, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -291,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -305,7 +302,7 @@ "x + x â‹…y + 6â‹…x " ] }, - "execution_count": 49, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -316,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -330,7 +327,7 @@ "x â‹…(x + y + 6)" ] }, - "execution_count": 50, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -341,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -355,7 +352,7 @@ "x â‹…(x + cos(x) + 5) + x " ] }, - "execution_count": 51, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -373,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -387,7 +384,7 @@ "x â‹…(x + y + 5) + x = 1" ] }, - "execution_count": 52, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -399,7 +396,7 @@ }, { 
"cell_type": "code", - "execution_count": 53, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -415,7 +412,7 @@ "⎣ x ⎦" ] }, - "execution_count": 53, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -447,7 +444,7 @@ "x â‹…(x + y + 5) + x " ] }, - "execution_count": 54, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -458,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -467,7 +464,7 @@ "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n", "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n", " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n", - "<!-- Generated by graphviz version 11.0.0 (0)\n", + "<!-- Generated by graphviz version 12.1.2 (0)\n", " -->\n", "<!-- Pages: 1 -->\n", "<svg width=\"422pt\" height=\"260pt\"\n", @@ -616,10 +613,10 @@ "</svg>\n" ], "text/plain": [ - "<graphviz.sources.Source at 0x7e3154f58d30>" + "<graphviz.sources.Source at 0x7fd4d0f809d0>" ] }, - "execution_count": 55, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -647,7 +644,7 @@ "sympy.core.add.Add" ] }, - "execution_count": 56, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -658,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -672,7 +669,7 @@ "âŽx , x â‹…(x + y + 5)⎠" ] }, - "execution_count": 57, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -715,7 +712,7 @@ }, { 
"cell_type": "code", - "execution_count": 59, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -728,7 +725,7 @@ "f_E__1" ] }, - "execution_count": 59, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -747,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -756,7 +753,7 @@ "True" ] }, - "execution_count": 60, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -776,7 +773,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -785,7 +782,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -802,7 +799,7 @@ "_W__2â‹…wâ‚‚) " ] }, - "execution_count": 62, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -824,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -841,7 +838,7 @@ " img_W__2â‹…wâ‚‚) " ] }, - "execution_count": 63, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -860,7 +857,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -877,7 +874,7 @@ "g_SW__2 - img_W__2â‹…wâ‚‚) " ] }, - "execution_count": 64, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -897,14 +894,16 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/media/data/fhennig/research-hpc/projects/2024_pystencils_nbackend/pystencils/src/pystencils/config.py:327: FutureWarning: The `cpu_openmp` option of CreateKernelConfig is deprecated and will be removed in pystencils 2.1. 
Use `cpu_optim.openmp` instead.\n", + "/media/data/fhennig/research-hpc/projects/2024_pystencils_nbackend/pystencils/src/pystencils/codegen/config.py:633: FutureWarning: The `cpu_openmp` option of CreateKernelConfig is deprecated and will be removed in pystencils 2.1. Use `cpu_optim.openmp` instead.\n", + " warn(\n", + "/media/data/fhennig/research-hpc/projects/2024_pystencils_nbackend/pystencils/src/pystencils/codegen/config.py:543: UserWarning: Setting the deprecated `cpu_openmp` option will override any options passed in the `cpu.openmp` category.\n", " warn(\n" ] } @@ -925,15 +924,18 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 30, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "No requests or imageio installed\n" - ] + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -953,12 +955,12 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "<Figure size 640x480 with 1 Axes>" ] @@ -985,7 +987,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1002,7 +1004,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1016,9 +1018,9 @@ ".highlight .hll { background-color: #ffffcc }\n", ".highlight { background: #f8f8f8; }\n", ".highlight .c { color: #3D7B7B; font-style: italic } /* Comment */\n", - ".highlight .err { border: 1px solid #FF0000 } /* Error */\n", + ".highlight .err { border: 1px solid #F00 } /* Error */\n", ".highlight .k { color: #008000; font-weight: bold } /* Keyword */\n", - ".highlight .o { color: #666666 } /* Operator */\n", + ".highlight .o { color: #666 } /* Operator */\n", ".highlight .ch { color: #3D7B7B; 
font-style: italic } /* Comment.Hashbang */\n", ".highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */\n", ".highlight .cp { color: #9C6500 } /* Comment.Preproc */\n", @@ -1035,34 +1037,34 @@ ".highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */\n", ".highlight .gs { font-weight: bold } /* Generic.Strong */\n", ".highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */\n", - ".highlight .gt { color: #0044DD } /* Generic.Traceback */\n", + ".highlight .gt { color: #04D } /* Generic.Traceback */\n", ".highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */\n", ".highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */\n", ".highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */\n", ".highlight .kp { color: #008000 } /* Keyword.Pseudo */\n", ".highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */\n", ".highlight .kt { color: #B00040 } /* Keyword.Type */\n", - ".highlight .m { color: #666666 } /* Literal.Number */\n", + ".highlight .m { color: #666 } /* Literal.Number */\n", ".highlight .s { color: #BA2121 } /* Literal.String */\n", ".highlight .na { color: #687822 } /* Name.Attribute */\n", ".highlight .nb { color: #008000 } /* Name.Builtin */\n", - ".highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */\n", - ".highlight .no { color: #880000 } /* Name.Constant */\n", - ".highlight .nd { color: #AA22FF } /* Name.Decorator */\n", + ".highlight .nc { color: #00F; font-weight: bold } /* Name.Class */\n", + ".highlight .no { color: #800 } /* Name.Constant */\n", + ".highlight .nd { color: #A2F } /* Name.Decorator */\n", ".highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */\n", ".highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */\n", - ".highlight .nf { color: #0000FF } /* Name.Function */\n", + ".highlight .nf { color: #00F } /* Name.Function */\n", ".highlight .nl { color: 
#767600 } /* Name.Label */\n", - ".highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */\n", + ".highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */\n", ".highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */\n", ".highlight .nv { color: #19177C } /* Name.Variable */\n", - ".highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */\n", - ".highlight .w { color: #bbbbbb } /* Text.Whitespace */\n", - ".highlight .mb { color: #666666 } /* Literal.Number.Bin */\n", - ".highlight .mf { color: #666666 } /* Literal.Number.Float */\n", - ".highlight .mh { color: #666666 } /* Literal.Number.Hex */\n", - ".highlight .mi { color: #666666 } /* Literal.Number.Integer */\n", - ".highlight .mo { color: #666666 } /* Literal.Number.Oct */\n", + ".highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */\n", + ".highlight .w { color: #BBB } /* Text.Whitespace */\n", + ".highlight .mb { color: #666 } /* Literal.Number.Bin */\n", + ".highlight .mf { color: #666 } /* Literal.Number.Float */\n", + ".highlight .mh { color: #666 } /* Literal.Number.Hex */\n", + ".highlight .mi { color: #666 } /* Literal.Number.Integer */\n", + ".highlight .mo { color: #666 } /* Literal.Number.Oct */\n", ".highlight .sa { color: #BA2121 } /* Literal.String.Affix */\n", ".highlight .sb { color: #BA2121 } /* Literal.String.Backtick */\n", ".highlight .sc { color: #BA2121 } /* Literal.String.Char */\n", @@ -1077,12 +1079,12 @@ ".highlight .s1 { color: #BA2121 } /* Literal.String.Single */\n", ".highlight .ss { color: #19177C } /* Literal.String.Symbol */\n", ".highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */\n", - ".highlight .fm { color: #0000FF } /* Name.Function.Magic */\n", + ".highlight .fm { color: #00F } /* Name.Function.Magic */\n", ".highlight .vc { color: #19177C } /* Name.Variable.Class */\n", ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* 
Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */</style>" + ".highlight .il { color: #666 } /* Literal.Number.Integer.Long */</style>" ], "text/plain": [ "<IPython.core.display.HTML object>" @@ -1094,32 +1096,28 @@ { "data": { "text/html": [ - "<div class=\"highlight\"><pre><span></span><span class=\"n\">FUNC_PREFIX</span><span class=\"w\"> </span><span class=\"kt\">void</span><span class=\"w\"> </span><span class=\"n\">kernel</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span 
class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">dst_data</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"p\">)</span>\n", + "<div class=\"highlight\"><pre><span></span><span class=\"n\">FUNC_PREFIX</span><span class=\"w\"> </span><span class=\"kt\">void</span><span class=\"w\"> </span><span class=\"n\">kernel</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">RESTRICT</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">_data_dst</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">RESTRICT</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_1</span><span class=\"p\">,</span><span class=\"w\"> 
</span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"p\">)</span>\n", "<span class=\"p\">{</span>\n", "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"n\">_size_dst_0</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span 
class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", "<span class=\"w\"> </span><span class=\"p\">{</span>\n", "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"n\">_size_dst_1</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", "<span class=\"w\"> </span><span class=\"p\">{</span>\n", - "<span class=\"w\"> </span><span class=\"n\">dst_data</span><span class=\"p\">[</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span 
class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span 
class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span 
class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span 
class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">])</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> 
</span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span 
class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span 
class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]);</span>\n", + "<span class=\"w\"> </span><span class=\"n\">_data_dst</span><span class=\"p\">[</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span 
class=\"n\">_stride_dst_1</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> 
</span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span 
class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> 
</span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">])</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span 
class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span 
class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span 
class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]);</span>\n", "<span 
class=\"w\"> </span><span class=\"p\">}</span>\n", - "\n", "<span class=\"w\"> </span><span class=\"p\">}</span>\n", - "\n", "<span class=\"p\">}</span>\n", "</pre></div>\n" ], "text/plain": [ - "FUNC_PREFIX void kernel (const int64_t _size_dst_0, const int64_t _size_dst_1, const int64_t _stride_dst_0, const int64_t _stride_dst_1, const int64_t _stride_img_0, const int64_t _stride_img_1, const int64_t _stride_img_2, double * const dst_data, double * const img_data, const double w_2)\n", + "FUNC_PREFIX void kernel (double * RESTRICT const _data_dst, double * RESTRICT const _data_img, const int64_t _size_dst_0, const int64_t _size_dst_1, const int64_t _stride_dst_0, const int64_t _stride_dst_1, const int64_t _stride_img_0, const int64_t _stride_img_1, const int64_t _stride_img_2, const double w_2)\n", "{\n", " for(int64_t ctr_0 = 1LL; ctr_0 < _size_dst_0 - 1LL; ctr_0 += 1LL)\n", " {\n", " for(int64_t ctr_1 = 1LL; ctr_1 < _size_dst_1 - 1LL; ctr_1 += 1LL)\n", " {\n", - " dst_data[ctr_0 * _stride_dst_0 + ctr_1 * _stride_dst_1] = (0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * img_data[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * img_data[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]) * (0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + 
(ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * img_data[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * img_data[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]);\n", + " _data_dst[ctr_0 * _stride_dst_0 + ctr_1 * _stride_dst_1] = (0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]) * (0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]);\n", " }\n", - "\n", " }\n", - "\n", "}" ] }, @@ -1140,7 +1138,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1154,9 +1152,9 @@ ".highlight .hll { background-color: #ffffcc }\n", ".highlight { background: #f8f8f8; }\n", ".highlight .c { color: #3D7B7B; font-style: italic } /* Comment */\n", - ".highlight .err { border: 1px solid #FF0000 } /* Error */\n", + ".highlight .err { border: 1px 
solid #F00 } /* Error */\n", ".highlight .k { color: #008000; font-weight: bold } /* Keyword */\n", - ".highlight .o { color: #666666 } /* Operator */\n", + ".highlight .o { color: #666 } /* Operator */\n", ".highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */\n", ".highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */\n", ".highlight .cp { color: #9C6500 } /* Comment.Preproc */\n", @@ -1173,34 +1171,34 @@ ".highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */\n", ".highlight .gs { font-weight: bold } /* Generic.Strong */\n", ".highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */\n", - ".highlight .gt { color: #0044DD } /* Generic.Traceback */\n", + ".highlight .gt { color: #04D } /* Generic.Traceback */\n", ".highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */\n", ".highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */\n", ".highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */\n", ".highlight .kp { color: #008000 } /* Keyword.Pseudo */\n", ".highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */\n", ".highlight .kt { color: #B00040 } /* Keyword.Type */\n", - ".highlight .m { color: #666666 } /* Literal.Number */\n", + ".highlight .m { color: #666 } /* Literal.Number */\n", ".highlight .s { color: #BA2121 } /* Literal.String */\n", ".highlight .na { color: #687822 } /* Name.Attribute */\n", ".highlight .nb { color: #008000 } /* Name.Builtin */\n", - ".highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */\n", - ".highlight .no { color: #880000 } /* Name.Constant */\n", - ".highlight .nd { color: #AA22FF } /* Name.Decorator */\n", + ".highlight .nc { color: #00F; font-weight: bold } /* Name.Class */\n", + ".highlight .no { color: #800 } /* Name.Constant */\n", + ".highlight .nd { color: #A2F } /* Name.Decorator */\n", ".highlight .ni { color: #717171; font-weight: bold } /* 
Name.Entity */\n", ".highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */\n", - ".highlight .nf { color: #0000FF } /* Name.Function */\n", + ".highlight .nf { color: #00F } /* Name.Function */\n", ".highlight .nl { color: #767600 } /* Name.Label */\n", - ".highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */\n", + ".highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */\n", ".highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */\n", ".highlight .nv { color: #19177C } /* Name.Variable */\n", - ".highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */\n", - ".highlight .w { color: #bbbbbb } /* Text.Whitespace */\n", - ".highlight .mb { color: #666666 } /* Literal.Number.Bin */\n", - ".highlight .mf { color: #666666 } /* Literal.Number.Float */\n", - ".highlight .mh { color: #666666 } /* Literal.Number.Hex */\n", - ".highlight .mi { color: #666666 } /* Literal.Number.Integer */\n", - ".highlight .mo { color: #666666 } /* Literal.Number.Oct */\n", + ".highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */\n", + ".highlight .w { color: #BBB } /* Text.Whitespace */\n", + ".highlight .mb { color: #666 } /* Literal.Number.Bin */\n", + ".highlight .mf { color: #666 } /* Literal.Number.Float */\n", + ".highlight .mh { color: #666 } /* Literal.Number.Hex */\n", + ".highlight .mi { color: #666 } /* Literal.Number.Integer */\n", + ".highlight .mo { color: #666 } /* Literal.Number.Oct */\n", ".highlight .sa { color: #BA2121 } /* Literal.String.Affix */\n", ".highlight .sb { color: #BA2121 } /* Literal.String.Backtick */\n", ".highlight .sc { color: #BA2121 } /* Literal.String.Char */\n", @@ -1215,12 +1213,12 @@ ".highlight .s1 { color: #BA2121 } /* Literal.String.Single */\n", ".highlight .ss { color: #19177C } /* Literal.String.Symbol */\n", ".highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */\n", - ".highlight .fm { color: #0000FF } /* Name.Function.Magic */\n", + 
".highlight .fm { color: #00F } /* Name.Function.Magic */\n", ".highlight .vc { color: #19177C } /* Name.Variable.Class */\n", ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */</style>" + ".highlight .il { color: #666 } /* Literal.Number.Integer.Long */</style>" ], "text/plain": [ "<IPython.core.display.HTML object>" @@ -1232,34 +1230,30 @@ { "data": { "text/html": [ - "<div class=\"highlight\"><pre><span></span><span class=\"n\">FUNC_PREFIX</span><span class=\"w\"> </span><span class=\"kt\">void</span><span class=\"w\"> </span><span class=\"n\">kernel</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span 
class=\"n\">_stride_img_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">dst_data</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"p\">)</span>\n", + "<div class=\"highlight\"><pre><span></span><span class=\"n\">FUNC_PREFIX</span><span class=\"w\"> </span><span class=\"kt\">void</span><span class=\"w\"> </span><span class=\"n\">kernel</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">RESTRICT</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">_data_dst</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">RESTRICT</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_0</span><span 
class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_size_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"p\">)</span>\n", "<span class=\"p\">{</span>\n", "<span class=\"w\"> </span><span class=\"cp\">#pragma omp parallel for schedule(static) num_threads(2)</span>\n", "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> 
</span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"n\">_size_dst_0</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", "<span class=\"w\"> </span><span class=\"p\">{</span>\n", "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"n\">_size_dst_1</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", "<span class=\"w\"> </span><span class=\"p\">{</span>\n", - "<span class=\"w\"> </span><span class=\"n\">dst_data</span><span class=\"p\">[</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">]</span><span class=\"w\"> </span><span 
class=\"o\">=</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span 
class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> 
</span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span 
class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">])</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span 
class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> 
</span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span 
class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">img_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]);</span>\n", + "<span class=\"w\"> </span><span class=\"n\">_data_dst</span><span class=\"p\">[</span><span 
class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_dst_1</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span 
class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span 
class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> 
</span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">])</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"mf\">0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> 
</span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> 
</span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-0.5</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span 
class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">w_2</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_img</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span 
class=\"n\">_stride_img_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_stride_img_2</span><span class=\"p\">]);</span>\n", "<span class=\"w\"> </span><span class=\"p\">}</span>\n", - "\n", "<span class=\"w\"> </span><span class=\"p\">}</span>\n", - "\n", "<span class=\"p\">}</span>\n", "</pre></div>\n" ], "text/plain": [ - "FUNC_PREFIX void kernel (const int64_t _size_dst_0, const int64_t _size_dst_1, const int64_t _stride_dst_0, const int64_t _stride_dst_1, const int64_t _stride_img_0, const int64_t _stride_img_1, const int64_t _stride_img_2, double * const dst_data, double * const img_data, const double w_2)\n", + "FUNC_PREFIX void kernel (double * RESTRICT const _data_dst, double * RESTRICT const _data_img, const int64_t _size_dst_0, const int64_t _size_dst_1, const int64_t _stride_dst_0, const int64_t _stride_dst_1, const int64_t _stride_img_0, const int64_t _stride_img_1, const int64_t _stride_img_2, const double w_2)\n", "{\n", " #pragma omp parallel for schedule(static) num_threads(2)\n", " for(int64_t ctr_0 = 1LL; ctr_0 < _size_dst_0 - 1LL; ctr_0 += 1LL)\n", " {\n", " for(int64_t ctr_1 = 1LL; ctr_1 < _size_dst_1 - 1LL; ctr_1 += 1LL)\n", " {\n", - " dst_data[ctr_0 * _stride_dst_0 + ctr_1 * _stride_dst_1] = (0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * img_data[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * img_data[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 
+ 2LL * _stride_img_2]) * (0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] - 0.5 * img_data[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * img_data[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * img_data[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]);\n", + " _data_dst[ctr_0 * _stride_dst_0 + ctr_1 * _stride_dst_1] = (0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]) * (0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + 1LL) * _stride_img_1 + 2LL * _stride_img_2] + -0.5 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + (ctr_1 + -1LL) * _stride_img_1 + 2LL * _stride_img_2] + w_2 * _data_img[(ctr_0 + 1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2] - w_2 * _data_img[(ctr_0 + -1LL) * _stride_img_0 + ctr_1 * _stride_img_1 + 2LL * _stride_img_2]);\n", " }\n", - "\n", " }\n", - "\n", "}" ] }, @@ -1270,8 +1264,8 @@ "source": [ 
"ast = ps.create_kernel(\n", " update_rule,\n", - " cpu_optim = ps.CpuOptimConfig(\n", - " openmp=ps.OpenMpConfig(num_threads=2))\n", + " cpu = ps.CpuOptions(\n", + " openmp=ps.OpenMpOptions(enable=True, num_threads=2))\n", " )\n", "\n", "ps.show_code(ast)" @@ -1289,7 +1283,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1303,9 +1297,9 @@ ".highlight .hll { background-color: #ffffcc }\n", ".highlight { background: #f8f8f8; }\n", ".highlight .c { color: #3D7B7B; font-style: italic } /* Comment */\n", - ".highlight .err { border: 1px solid #FF0000 } /* Error */\n", + ".highlight .err { border: 1px solid #F00 } /* Error */\n", ".highlight .k { color: #008000; font-weight: bold } /* Keyword */\n", - ".highlight .o { color: #666666 } /* Operator */\n", + ".highlight .o { color: #666 } /* Operator */\n", ".highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */\n", ".highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */\n", ".highlight .cp { color: #9C6500 } /* Comment.Preproc */\n", @@ -1322,34 +1316,34 @@ ".highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */\n", ".highlight .gs { font-weight: bold } /* Generic.Strong */\n", ".highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */\n", - ".highlight .gt { color: #0044DD } /* Generic.Traceback */\n", + ".highlight .gt { color: #04D } /* Generic.Traceback */\n", ".highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */\n", ".highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */\n", ".highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */\n", ".highlight .kp { color: #008000 } /* Keyword.Pseudo */\n", ".highlight .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */\n", ".highlight .kt { color: #B00040 } /* Keyword.Type */\n", - ".highlight .m { color: #666666 } /* Literal.Number */\n", + 
".highlight .m { color: #666 } /* Literal.Number */\n", ".highlight .s { color: #BA2121 } /* Literal.String */\n", ".highlight .na { color: #687822 } /* Name.Attribute */\n", ".highlight .nb { color: #008000 } /* Name.Builtin */\n", - ".highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */\n", - ".highlight .no { color: #880000 } /* Name.Constant */\n", - ".highlight .nd { color: #AA22FF } /* Name.Decorator */\n", + ".highlight .nc { color: #00F; font-weight: bold } /* Name.Class */\n", + ".highlight .no { color: #800 } /* Name.Constant */\n", + ".highlight .nd { color: #A2F } /* Name.Decorator */\n", ".highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */\n", ".highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */\n", - ".highlight .nf { color: #0000FF } /* Name.Function */\n", + ".highlight .nf { color: #00F } /* Name.Function */\n", ".highlight .nl { color: #767600 } /* Name.Label */\n", - ".highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */\n", + ".highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */\n", ".highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */\n", ".highlight .nv { color: #19177C } /* Name.Variable */\n", - ".highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */\n", - ".highlight .w { color: #bbbbbb } /* Text.Whitespace */\n", - ".highlight .mb { color: #666666 } /* Literal.Number.Bin */\n", - ".highlight .mf { color: #666666 } /* Literal.Number.Float */\n", - ".highlight .mh { color: #666666 } /* Literal.Number.Hex */\n", - ".highlight .mi { color: #666666 } /* Literal.Number.Integer */\n", - ".highlight .mo { color: #666666 } /* Literal.Number.Oct */\n", + ".highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */\n", + ".highlight .w { color: #BBB } /* Text.Whitespace */\n", + ".highlight .mb { color: #666 } /* Literal.Number.Bin */\n", + ".highlight .mf { color: #666 } /* Literal.Number.Float */\n", + 
".highlight .mh { color: #666 } /* Literal.Number.Hex */\n", + ".highlight .mi { color: #666 } /* Literal.Number.Integer */\n", + ".highlight .mo { color: #666 } /* Literal.Number.Oct */\n", ".highlight .sa { color: #BA2121 } /* Literal.String.Affix */\n", ".highlight .sb { color: #BA2121 } /* Literal.String.Backtick */\n", ".highlight .sc { color: #BA2121 } /* Literal.String.Char */\n", @@ -1364,12 +1358,12 @@ ".highlight .s1 { color: #BA2121 } /* Literal.String.Single */\n", ".highlight .ss { color: #19177C } /* Literal.String.Symbol */\n", ".highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */\n", - ".highlight .fm { color: #0000FF } /* Name.Function.Magic */\n", + ".highlight .fm { color: #00F } /* Name.Function.Magic */\n", ".highlight .vc { color: #19177C } /* Name.Variable.Class */\n", ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */</style>" + ".highlight .il { color: #666 } /* Literal.Number.Integer.Long */</style>" ], "text/plain": [ "<IPython.core.display.HTML object>" @@ -1381,32 +1375,28 @@ { "data": { "text/html": [ - "<div class=\"highlight\"><pre><span></span><span class=\"n\">FUNC_PREFIX</span><span class=\"w\"> </span><span class=\"kt\">void</span><span class=\"w\"> </span><span class=\"n\">kernel</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">dst_data</span><span class=\"p\">)</span>\n", 
+ "<div class=\"highlight\"><pre><span></span><span class=\"n\">FUNC_PREFIX</span><span class=\"w\"> </span><span class=\"kt\">void</span><span class=\"w\"> </span><span class=\"n\">kernel</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">RESTRICT</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">,</span><span class=\"w\"> </span><span class=\"kt\">double</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">RESTRICT</span><span class=\"w\"> </span><span class=\"k\">const</span><span class=\"w\"> </span><span class=\"n\">_data_dst</span><span class=\"p\">)</span>\n", "<span class=\"p\">{</span>\n", - "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"mf\">81L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", + "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> 
</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"mf\">202L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", "<span class=\"w\"> </span><span class=\"p\">{</span>\n", - "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"mf\">289L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span>\n", + "<span class=\"w\"> </span><span class=\"k\">for</span><span class=\"p\">(</span><span class=\"kt\">int64_t</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\"><</span><span class=\"w\"> </span><span class=\"mf\">600L</span><span class=\"n\">L</span><span class=\"p\">;</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+=</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span 
class=\"p\">)</span>\n", "<span class=\"w\"> </span><span class=\"p\">{</span>\n", - "<span class=\"w\"> </span><span class=\"n\">dst_data</span><span class=\"p\">[</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">290L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">-1.0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">1160L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> 
</span><span class=\"mf\">1160L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">1160L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"mf\">2.0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span 
class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">1160L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2.0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">1160L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">I_data</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">1160L</span><span 
class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">];</span>\n", + "<span class=\"w\"> </span><span class=\"n\">_data_dst</span><span class=\"p\">[</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">601L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">=</span><span class=\"w\"> </span><span class=\"mf\">-1.0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">2404L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span 
class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">2404L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">-</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">2404L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> 
</span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">-2.0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">2404L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">2.0</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">2404L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span 
class=\"n\">L</span><span class=\"p\">]</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"n\">_data_I</span><span class=\"p\">[(</span><span class=\"n\">ctr_0</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">2404L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"p\">(</span><span class=\"n\">ctr_1</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mi\">-1LL</span><span class=\"p\">)</span><span class=\"w\"> </span><span class=\"o\">*</span><span class=\"w\"> </span><span class=\"mf\">4L</span><span class=\"n\">L</span><span class=\"w\"> </span><span class=\"o\">+</span><span class=\"w\"> </span><span class=\"mf\">1L</span><span class=\"n\">L</span><span class=\"p\">];</span>\n", "<span class=\"w\"> </span><span class=\"p\">}</span>\n", - "\n", "<span class=\"w\"> </span><span class=\"p\">}</span>\n", - "\n", "<span class=\"p\">}</span>\n", "</pre></div>\n" ], "text/plain": [ - "FUNC_PREFIX void kernel (double * const I_data, double * const dst_data)\n", + "FUNC_PREFIX void kernel (double * RESTRICT const _data_I, double * RESTRICT const _data_dst)\n", "{\n", - " for(int64_t ctr_0 = 1LL; ctr_0 < 81LL; ctr_0 += 1LL)\n", + " for(int64_t ctr_0 = 1LL; ctr_0 < 202LL; ctr_0 += 1LL)\n", " {\n", - " for(int64_t ctr_1 = 1LL; ctr_1 < 289LL; ctr_1 += 1LL)\n", + " for(int64_t ctr_1 = 1LL; ctr_1 < 600LL; ctr_1 += 1LL)\n", " {\n", - " dst_data[ctr_0 * 290LL + ctr_1] = -1.0 * I_data[(ctr_0 + 1LL) * 1160LL + (ctr_1 + 1LL) * 4LL + 1LL] - I_data[(ctr_0 + -1LL) * 1160LL + (ctr_1 + 1LL) * 4LL + 1LL] - I_data[(ctr_0 + -1LL) * 1160LL + (ctr_1 + -1LL) * 4LL + 1LL] - 2.0 * I_data[(ctr_0 + -1LL) * 1160LL + ctr_1 * 
4LL + 1LL] + 2.0 * I_data[(ctr_0 + 1LL) * 1160LL + ctr_1 * 4LL + 1LL] + I_data[(ctr_0 + 1LL) * 1160LL + (ctr_1 + -1LL) * 4LL + 1LL];\n", + " _data_dst[ctr_0 * 601LL + ctr_1] = -1.0 * _data_I[(ctr_0 + 1LL) * 2404LL + (ctr_1 + 1LL) * 4LL + 1LL] - _data_I[(ctr_0 + -1LL) * 2404LL + (ctr_1 + 1LL) * 4LL + 1LL] - _data_I[(ctr_0 + -1LL) * 2404LL + (ctr_1 + -1LL) * 4LL + 1LL] + -2.0 * _data_I[(ctr_0 + -1LL) * 2404LL + ctr_1 * 4LL + 1LL] + 2.0 * _data_I[(ctr_0 + 1LL) * 2404LL + ctr_1 * 4LL + 1LL] + _data_I[(ctr_0 + 1LL) * 2404LL + (ctr_1 + -1LL) * 4LL + 1LL];\n", " }\n", - "\n", " }\n", - "\n", "}" ] }, @@ -1443,7 +1433,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -1486,7 +1476,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 2bb4aac3d..8c59f7846 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -4,7 +4,7 @@ from .codegen import ( Target, CreateKernelConfig, CpuOptions, - VectorizationConfig, + VectorizationOptions, OpenMpOptions, GpuOptions, AUTO @@ -51,7 +51,7 @@ __all__ = [ "make_slice", "CreateKernelConfig", "CpuOptions", - "VectorizationConfig", + "VectorizationOptions", "GpuOptions", "OpenMpOptions", "AUTO", diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py index 031a0d843..f7425c06b 100644 --- a/src/pystencils/backend/kernelcreation/iteration_space.py +++ b/src/pystencils/backend/kernelcreation/iteration_space.py @@ -17,7 +17,6 @@ from ...types import PsStructType from ..exceptions import PsInputError, KernelConstraintsError if TYPE_CHECKING: - from ...codegen.config import _AUTO_TYPE from .context import KernelCreationContext @@ -62,6 +61,7 @@ class FullIterationSpace(IterationSpace): @dataclass class Dimension: """One 
dimension of a dense iteration space""" + start: PsExpression stop: PsExpression step: PsExpression @@ -196,7 +196,7 @@ class FullIterationSpace(IterationSpace): def dimensions(self): """The dimensions of this iteration space""" return self._dimensions - + @property def counters(self) -> tuple[PsSymbol, ...]: return tuple(dim.counter for dim in self._dimensions) @@ -220,7 +220,7 @@ class FullIterationSpace(IterationSpace): def archetype_field(self) -> Field | None: """Field whose shape and memory layout act as archetypes for this iteration space's dimensions.""" return self._archetype_field - + @property def loop_order(self) -> tuple[int, ...]: """Return the loop order of this iteration space, ordered from slowest to fastest coordinate.""" @@ -242,7 +242,7 @@ class FullIterationSpace(IterationSpace): self, dimension: int | FullIterationSpace.Dimension | None = None ) -> PsExpression: """Construct an expression representing the actual number of unique points inside the iteration space. - + Args: dimension: If an integer or a `Dimension` object is given, the number of iterations in that dimension is computed. If `None`, the total number of iterations inside the entire space @@ -417,14 +417,59 @@ def create_sparse_iteration_space( def create_full_iteration_space( ctx: KernelCreationContext, assignments: AssignmentCollection, - ghost_layers: None | _AUTO_TYPE | int | Sequence[int | tuple[int, int]] = None, + ghost_layers: None | int | Sequence[int | tuple[int, int]] = None, iteration_slice: None | int | slice | tuple[int | slice, ...] = None, + infer_ghost_layers: bool = False, ) -> IterationSpace: + """Create a dense iteration space from a sequence of assignments and iteration slice information. + + This function finds all accesses to fields in the given assignment collection, + analyzes the set of fields involved, + and determines the iteration space bounds from these. 
+ This requires that either all fields are of the same, fixed, shape, or all of them are + variable-shaped. + Also, all fields need to have the same memory layout of their spatial dimensions. + + Args: + ctx: The kernel creation context + assignments: Collection of assignments the iteration space should be inferred from + ghost_layers: If set, strip off that many ghost layers from all sides of the iteration cuboid + iteration_slice: If set, constrain iteration to the given slice. + For details on the parsing of slices, see `AstFactory.parse_slice`. + infer_ghost_layers: If `True`, infer the number of ghost layers from the stencil ranges + used in the kernel. + + Returns: + IterationSpace: The constructed iteration space. + + Raises: + KernelConstraintsError: If field shape or memory layout conflicts are detected + ValueError: If the iteration slice could not be parsed + + .. attention:: + The ``ghost_layers`` and ``iteration_slice`` arguments are mutually exclusive. + Also, if ``infer_ghost_layers=True``, none of them may be set. + """ + assert not ctx.fields.index_fields - if (ghost_layers is not None) and (iteration_slice is not None): + if not ( + (ghost_layers is not None) + or (iteration_slice is not None) + or infer_ghost_layers + ): raise ValueError( - "At most one of `ghost_layers` and `iteration_slice` may be specified." + "One argument of `ghost_layers`, `iteration_slice`, and `infer_ghost_layers` must be set." + ) + + if ( + int(ghost_layers is not None) + + int(iteration_slice is not None) + + int(infer_ghost_layers) + > 1 + ): + raise ValueError( + "At most one of `ghost_layers`, `iteration_slice`, and `infer_ghost_layers` may be set." 
) # Collect all relative accesses into domain fields @@ -457,9 +502,7 @@ def create_full_iteration_space( # Otherwise, if an iteration slice was specified, use that # Otherwise, use the inferred ghost layers - from ...codegen.config import AUTO, _AUTO_TYPE - - if ghost_layers is AUTO: + if infer_ghost_layers: if len(domain_field_accesses) > 0: inferred_gls = max( [fa.required_ghost_layers for fa in domain_field_accesses] @@ -472,7 +515,6 @@ def create_full_iteration_space( ctx, inferred_gls, archetype_field ) elif ghost_layers is not None: - assert not isinstance(ghost_layers, _AUTO_TYPE) ctx.metadata["ghost_layers"] = ghost_layers return FullIterationSpace.create_with_ghost_layers( ctx, ghost_layers, archetype_field diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 31686cb18..2559ac6d2 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -30,7 +30,7 @@ from ..literals import PsLiteral from ..functions import PsMathFunction, MathFunctions, CFunction if TYPE_CHECKING: - from ...codegen import GpuOptions, GpuThreadsRange + from ...codegen import GpuThreadsRange int32 = PsSignedIntegerType(width=32, const=False) @@ -52,13 +52,15 @@ class CudaPlatform(GenericGpu): """Platform for CUDA-based GPUs.""" def __init__( - self, ctx: KernelCreationContext, indexing_cfg: GpuOptions | None = None + self, ctx: KernelCreationContext, + omit_range_check: bool = False, + manual_launch_grid: bool = False, ) -> None: super().__init__(ctx) - from ...codegen.config import GpuOptions + self._omit_range_check = omit_range_check + self._manual_launch_grid = manual_launch_grid - self._cfg = indexing_cfg if indexing_cfg is not None else GpuOptions() self._typify = Typifier(ctx) @property @@ -141,7 +143,7 @@ class CudaPlatform(GenericGpu): ) -> tuple[PsBlock, GpuThreadsRange | None]: dimensions = ispace.dimensions_in_loop_order() - if not self._cfg.manual_launch_grid: + if not 
self._manual_launch_grid: try: threads_range = self.threads_from_ispace(ispace) except MaterializationError as e: @@ -170,7 +172,7 @@ class CudaPlatform(GenericGpu): ) ) ) - if not self._cfg.omit_range_check: + if not self._omit_range_check: conds.append(PsLt(ctr, dim.stop)) indexing_decls = indexing_decls[::-1] @@ -213,7 +215,7 @@ class CudaPlatform(GenericGpu): ] body.statements = mappings + body.statements - if not self._cfg.omit_range_check: + if not self._omit_range_check: stop = PsExpression.make(ispace.index_list.shape[0]) condition = PsLt(sparse_ctr, stop) ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index b5ba7b6c4..594c87b14 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -19,7 +19,7 @@ from ..ast.expressions import ( PsLe, PsTernary, PsLookup, - PsBufferAcc + PsBufferAcc, ) from ..extensions.cpp import CppMethodCall @@ -30,19 +30,21 @@ from ..exceptions import MaterializationError from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType if TYPE_CHECKING: - from ...codegen import GpuOptions, GpuThreadsRange + from ...codegen import GpuThreadsRange class SyclPlatform(GenericGpu): def __init__( - self, ctx: KernelCreationContext, indexing_cfg: GpuOptions | None = None + self, + ctx: KernelCreationContext, + omit_range_check: bool = False, + automatic_block_size: bool = False ): super().__init__(ctx) - from ...codegen.config import GpuOptions - - self._cfg = indexing_cfg if indexing_cfg is not None else GpuOptions() + self._omit_range_check = omit_range_check + self._automatic_block_size = automatic_block_size @property def required_headers(self) -> set[str]: @@ -138,7 +140,7 @@ class SyclPlatform(GenericGpu): indexing_decls.append( PsDeclaration(ctr, dim.start + work_item_idx * dim.step) ) - if not self._cfg.omit_range_check: + if not self._omit_range_check: 
conds.append(PsLt(ctr, dim.stop)) if conds: @@ -156,7 +158,7 @@ class SyclPlatform(GenericGpu): self, body: PsBlock, ispace: SparseIterationSpace ) -> tuple[PsBlock, GpuThreadsRange]: factory = AstFactory(self._ctx) - + id_type = PsCustomType("sycl::id< 1 >", const=True) id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type)) @@ -184,7 +186,7 @@ class SyclPlatform(GenericGpu): ] body.statements = mappings + body.statements - if not self._cfg.omit_range_check: + if not self._omit_range_check: stop = PsExpression.make(ispace.index_list.shape[0]) condition = PsLt(sparse_ctr, stop) ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) @@ -195,7 +197,7 @@ class SyclPlatform(GenericGpu): return ast, self.threads_from_ispace(ispace) def _item_type(self, rank: int): - if not self._cfg.sycl_automatic_block_size: + if not self._automatic_block_size: return PsCustomType(f"sycl::nd_item< {rank} >", const=True) else: return PsCustomType(f"sycl::item< {rank} >", const=True) @@ -207,7 +209,7 @@ class SyclPlatform(GenericGpu): item_type = self._item_type(rank) item = PsExpression.make(self._ctx.get_symbol("sycl_item", item_type)) - if not self._cfg.sycl_automatic_block_size: + if not self._automatic_block_size: rhs = CppMethodCall(item, "get_global_id", self._id_type(rank)) else: rhs = CppMethodCall(item, "get_id", self._id_type(rank)) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index b033e4d58..f44b89c72 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -1,6 +1,5 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING from typing import Sequence from collections import defaultdict @@ -10,8 +9,6 @@ from ..ast import PsAstNode from ..ast.structural import PsBlock, PsLoop, PsPragma from ..ast.expressions import PsExpression -if TYPE_CHECKING: - from 
...codegen.config import OpenMpOptions __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"] @@ -105,19 +102,37 @@ class AddOpenMP: `OpenMpConfig` configuration. """ - def __init__(self, ctx: KernelCreationContext, omp_params: OpenMpOptions) -> None: + def __init__( + self, + ctx: KernelCreationContext, + nesting_depth: int = 0, + num_threads: int | None = None, + schedule: str | None = None, + collapse: int | None = None, + omit_parallel: bool = False, + ) -> None: pragma_text = "omp" - pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" - pragma_text += f" for schedule({omp_params.schedule})" - if omp_params.num_threads is not None: - pragma_text += f" num_threads({str(omp_params.num_threads)})" + if not omit_parallel: + pragma_text += " parallel" + + pragma_text += " for" + + if schedule is not None: + pragma_text += f" schedule({schedule})" + + if num_threads is not None: + pragma_text += f" num_threads({str(num_threads)})" - if omp_params.collapse > 0: - pragma_text += f" collapse({str(omp_params.collapse)})" + if collapse is not None: + if collapse <= 0: + raise ValueError( + f"Invalid value for OpenMP `collapse` clause: {collapse}" + ) + pragma_text += f" collapse({str(collapse)})" self._insert_pragmas = InsertPragmasAtLoops( - ctx, [LoopPragma(pragma_text, omp_params.nesting_depth)] + ctx, [LoopPragma(pragma_text, nesting_depth)] ) def __call__(self, node: PsAstNode) -> PsAstNode: diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py index da33f9ee2..3780527c6 100644 --- a/src/pystencils/codegen/__init__.py +++ b/src/pystencils/codegen/__init__.py @@ -2,7 +2,7 @@ from .target import Target from .config import ( CreateKernelConfig, CpuOptions, - VectorizationConfig, + VectorizationOptions, OpenMpOptions, GpuOptions, AUTO, @@ -15,7 +15,7 @@ __all__ = [ "Target", "CreateKernelConfig", "CpuOptions", - "VectorizationConfig", + "VectorizationOptions", "OpenMpOptions", "GpuOptions", "AUTO", diff --git 
a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 4e17ccc48..9abf51222 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -4,10 +4,9 @@ from typing import TYPE_CHECKING from warnings import warn from abc import ABC from collections.abc import Collection -from copy import copy from typing import Sequence, Generic, TypeVar, Callable, Any, cast -from dataclasses import dataclass, InitVar, replace, fields +from dataclasses import dataclass, InitVar, fields from .target import Target from ..field import Field, FieldType @@ -15,7 +14,6 @@ from ..field import Field, FieldType from ..types import ( PsIntegerType, UserTypeSpec, - PsIeeeFloatType, PsScalarType, create_type, ) @@ -26,10 +24,6 @@ if TYPE_CHECKING: from ..jit import JitBase -class PsOptionsError(Exception): - """Indicates an option clash in the `CreateKernelConfig`.""" - - Option_T = TypeVar("Option_T") Arg_T = TypeVar("Arg_T") @@ -51,6 +45,9 @@ class Option(Generic[Option_T, Arg_T]): Through the validator, options may also be set from arguments of a different type (`Arg_T`) than their value type (`Option_T`). If `Arg_T` is different from `Option_T`, the validator must perform the conversion from the former to the latter. + + .. note:: + `Arg_T` must always be a supertype of `Option_T`. """ def __init__( @@ -102,10 +99,32 @@ class Option(Generic[Option_T, Arg_T]): delattr(obj, self._lookup) -class BasicOption(Option[Option_T, Option_T]): ... +class BasicOption(Option[Option_T, Option_T]): ... # noqa: E701 class ConfigBase(ABC): + """Base class for configuration categories. + + This class implements query and retrieval mechanism for configuration options, + as well as deepcopy functionality for categories. + + Subclasses of `ConfigBase` must be `dataclasses`, + and all of their instance fields must have one of two descriptors types: + - Either `Option`, for scalar options; + - Or `Category` for option subcategories. 
+ + `Option` fields must be assigned immutable values, but are otherwise unconstrained. + `Category` subobjects must be subclasses of `ConfigBase`. + + **Retrieval** Options set to `None` are considered *unset*, i.e. the user has not provided a value. + Through the `Option` descriptor, these options can still have a default value. + To retrieve either the user-set value if one exists, or the default value otherwise, use `get_option`. + + **Deep-Copy** When a configuration object is copied, all of its subcategories must be copied along with it, + such that changes in the original do no affect the copy, and vice versa. + Such a deep copy is performed by the `copy <ConfigBase.copy>` method. + """ + def get_option(self, name: str) -> Any: """Get the value set for the specified option, or the option's default value if none has been set.""" descr: Option = type(self).__dict__[name] @@ -125,6 +144,31 @@ class ConfigBase(ABC): if new_val is not None: setattr(self, f.name, new_val) + def copy(self): + """Perform a semi-deep copy of this configuration object. + + This will recursively copy any config subobjects + (categories, i.e. subclasses of `ConfigBase` wrapped in the `Category` descriptor) + nested in this configuration object. Any other fields will be copied by reference. + """ + + # IMPLEMENTATION NOTES + # + # We do not need to call `copy` on any subcategories here, since the `Category` + # descriptor already calls `copy` in its `__set__` method, + # which is invoked during the constructor call in the `return` statement. + # Calling `copy` here would result in copying category objects twice. + # + # We cannot use the standard library `copy.copy` here, since it merely duplicates + # the instance dictionary and does not call the constructor. 
+ + config_fields = fields(self) # type: ignore + kwargs = dict() + for field in config_fields: + val = getattr(self, field.name) + kwargs[field.name] = val + return type(self)(**kwargs) + Category_T = TypeVar("Category_T", bound=ConfigBase) @@ -151,7 +195,7 @@ class Category(Generic[Category_T]): return cast(Category_T, getattr(obj, self._lookup, None)) def __set__(self, obj, cat: Category_T): - setattr(obj, self._lookup, copy(cat)) + setattr(obj, self._lookup, cat.copy()) class _AUTO_TYPE: ... # noqa: E701 @@ -191,15 +235,9 @@ class OpenMpOptions(ConfigBase): Use this option only if you intend to wrap the kernel into an external ``#pragma omp parallel`` region. """ - def __post_init__(self): - if self.omit_parallel_construct and self.num_threads is not None: - raise PsOptionsError( - "Cannot specify `num_threads` if `omit_parallel_construct` is set." - ) - @dataclass -class VectorizationConfig(ConfigBase): +class VectorizationOptions(ConfigBase): """Configuration for the auto-vectorizer. If any flag in this configuration is set to a value not supported by the CPU specified @@ -277,7 +315,7 @@ class CpuOptions(ConfigBase): """Options governing OpenMP-instrumentation. """ - vectorize: Category[OpenMpOptions] = Category(OpenMpOptions()) + vectorize: Category[VectorizationOptions] = Category(VectorizationOptions()) """Options governing intrinsic vectorization. """ @@ -294,14 +332,6 @@ class CpuOptions(ConfigBase): to produce cacheline zeroing instructions where possible. """ - def get_vectorization_config(self) -> VectorizationConfig | None: - if self.vectorize is True: - return VectorizationConfig() - elif isinstance(self.vectorize, VectorizationConfig): - return self.vectorize - else: - return None - @dataclass class GpuOptions(ConfigBase): @@ -326,8 +356,13 @@ class GpuOptions(ConfigBase): The launch grid will then have to be specified manually at runtime. 
""" - sycl_automatic_block_size: BasicOption[bool] = BasicOption(True) - """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size. + +@dataclass +class SyclOptions(ConfigBase): + """Options specific to the `SYCL <Target.SYCL>` target.""" + + automatic_block_size: BasicOption[bool] = BasicOption(True) + """If set to `True`, let the SYCL runtime decide on the block size. If set to `True`, the kernel is generated for execution via `parallel_for <https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_parallel_for_invoke>`_ @@ -431,12 +466,15 @@ class CreateKernelConfig(ConfigBase): """Target-Specific Options""" - cpu_optim: Category[CpuOptions] = Category(CpuOptions()) + cpu: Category[CpuOptions] = Category(CpuOptions()) """Options for CPU kernels.""" - gpu_indexing: Category[GpuOptions] = Category(GpuOptions()) + gpu: Category[GpuOptions] = Category(GpuOptions()) """Options for GPU Kernels.""" + sycl: Category[SyclOptions] = Category(SyclOptions()) + """Options for SYCL kernels.""" + @index_dtype.validate def validate_index_type(self, spec: UserTypeSpec): dtype = create_type(spec) @@ -496,11 +534,8 @@ class CreateKernelConfig(ConfigBase): try: from ..jit.gpu_cupy import CupyJit - if ( - self.gpu_indexing is not None - and self.gpu_indexing.block_size is not None - ): - return CupyJit(self.gpu_indexing.block_size) + if self.gpu is not None and self.gpu.block_size is not None: + return CupyJit(self.gpu.block_size) else: return CupyJit() @@ -533,8 +568,6 @@ class CreateKernelConfig(ConfigBase): cpu_vectorize_info: dict | None, gpu_indexing_params: dict | None, ): # pragma: no cover - optim: CpuOptions | None = None - if data_type is not None: _deprecated_option("data_type", "default_dtype") warn( @@ -546,27 +579,33 @@ class CreateKernelConfig(ConfigBase): if cpu_openmp is not None: _deprecated_option("cpu_openmp", "cpu_optim.openmp") + warn( + "Setting the deprecated `cpu_openmp` option will override any 
options " + "passed in the `cpu.openmp` category.", + UserWarning, + ) - deprecated_omp: OpenMpOptions | bool + deprecated_omp = OpenMpOptions() match cpu_openmp: case True: - deprecated_omp = OpenMpOptions() + deprecated_omp.enable = False case False: - deprecated_omp = False + deprecated_omp.enable = False case int(): - deprecated_omp = OpenMpOptions(num_threads=cpu_openmp) + deprecated_omp.enable = True + deprecated_omp.num_threads = cpu_openmp case _: - raise PsOptionsError( + raise ValueError( f"Invalid option for `cpu_openmp`: {cpu_openmp}" ) - optim = CpuOptions(openmp=deprecated_omp) + self.cpu.openmp = deprecated_omp if cpu_vectorize_info is not None: _deprecated_option("cpu_vectorize_info", "cpu_optim.vectorize") if "instruction_set" in cpu_vectorize_info: if self.target != Target.GenericCPU: - raise PsOptionsError( + raise ValueError( "Setting 'instruction_set' in the deprecated 'cpu_vectorize_info' option is only " "valid if `target == Target.CPU`." ) @@ -585,7 +624,7 @@ class CreateKernelConfig(ConfigBase): case "avx512vl": vec_target = Target.X86_AVX512 | Target._VL case _: - raise PsOptionsError( + raise ValueError( f'Value {isa} in `cpu_vectorize_info["instruction_set"]` is not supported.' 
) @@ -598,7 +637,14 @@ class CreateKernelConfig(ConfigBase): self.target = vec_target - deprecated_vec_opts = VectorizationConfig( + warn( + "Setting the deprecated `cpu_vectorize_info` will override any options " + "passed in the `cpu.vectorize` category.", + UserWarning, + ) + + deprecated_vec_opts = VectorizationOptions( + enable=True, assume_inner_stride_one=cpu_vectorize_info.get( "assume_inner_stride_one", False ), @@ -606,28 +652,16 @@ class CreateKernelConfig(ConfigBase): use_nontemporal_stores=cpu_vectorize_info.get("nontemporal", False), ) - if optim is not None: - optim = replace(optim, vectorize=deprecated_vec_opts) - else: - optim = CpuOptions(vectorize=deprecated_vec_opts) - - if optim is not None: - if self.cpu_optim is not None: - raise PsOptionsError( - "Cannot specify both `cpu_optim` and a deprecated legacy optimization option at the same time." - ) - else: - self.cpu_optim = optim + self.cpu.vectorize = deprecated_vec_opts if gpu_indexing_params is not None: _deprecated_option("gpu_indexing_params", "gpu_indexing") + warn( + "Setting the deprecated `gpu_indexing_params` will override any options " + "passed in the `gpu` category." + ) - if self.gpu_indexing is not None: - raise PsOptionsError( - "Cannot specify both `gpu_indexing` and the deprecated `gpu_indexing_params` at the same time." 
- ) - - self.gpu_indexing = GpuOptions( + self.gpu = GpuOptions( block_size=gpu_indexing_params.get("block_size", None) ) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 47bcb905d..6f44e718d 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -3,7 +3,14 @@ from typing import cast, Sequence, Iterable, TYPE_CHECKING from dataclasses import dataclass, replace from .target import Target -from .config import CreateKernelConfig, OpenMpOptions, VectorizationConfig, AUTO, GhostLayerSpec, IterationSliceSpec +from .config import ( + CreateKernelConfig, + VectorizationOptions, + AUTO, + _AUTO_TYPE, + GhostLayerSpec, + IterationSliceSpec, +) from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter @@ -115,15 +122,17 @@ class DefaultKernelCreationDriver: int(cfg.is_option_set("ghost_layers")) + int(cfg.is_option_set("iteration_slice")) + int(cfg.is_option_set("index_field")) - ) - + ) + if num_ispace_options_set > 1: raise ValueError( "At most one of the options 'ghost_layers' 'iteration_slice' and 'index_field' may be set." 
) - + self._ghost_layers: GhostLayerSpec | None = cfg.get_option("ghost_layers") - self._iteration_slice: IterationSliceSpec | None = cfg.get_option("iteration_slice") + self._iteration_slice: IterationSliceSpec | None = cfg.get_option( + "iteration_slice" + ) self._index_field: Field | None = cfg.get_option("index_field") if num_ispace_options_set == 0: @@ -235,22 +244,26 @@ class DefaultKernelCreationDriver: ) analysis(assignments) - if self._cfg.index_field is not None: + if self._index_field is not None: ispace = create_sparse_iteration_space( self._ctx, assignments, index_field=self._cfg.index_field ) else: - gls = self._cfg.ghost_layers - islice = self._cfg.iteration_slice - - if gls is None and islice is None: - gls = AUTO + gls: GhostLayerSpec | None + if self._ghost_layers == AUTO: + infer_gls = True + gls = None + else: + assert not isinstance(self._ghost_layers, _AUTO_TYPE) + infer_gls = False + gls = self._ghost_layers ispace = create_full_iteration_space( self._ctx, assignments, ghost_layers=gls, - iteration_slice=islice, + iteration_slice=self._iteration_slice, + infer_ghost_layers=infer_gls, ) self._ctx.set_iteration_space(ispace) @@ -279,7 +292,7 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.cpu_hoist_invariants = kernel_ast.clone() - cpu_cfg = self._cfg.cpu_optim + cpu_cfg = self._cfg.cpu if cpu_cfg is None: return kernel_ast @@ -288,30 +301,41 @@ class DefaultKernelCreationDriver: raise NotImplementedError("Loop blocking not implemented yet.") kernel_ast = self._vectorize(kernel_ast) + kernel_ast = self._add_openmp(kernel_ast) + + if cpu_cfg.use_cacheline_zeroing: + raise NotImplementedError("CL-zeroing not implemented yet") + + return kernel_ast + + def _add_openmp(self, kernel_ast: PsBlock) -> PsBlock: + omp_options = self._cfg.cpu.openmp + enable_omp: bool = omp_options.get_option("enable") - if cpu_cfg.openmp is not False: + if enable_omp: from ..backend.transformations import AddOpenMP - params 
= ( - cpu_cfg.openmp - if isinstance(cpu_cfg.openmp, OpenMpOptions) - else OpenMpOptions() + add_omp = AddOpenMP( + self._ctx, + nesting_depth=omp_options.get_option("nesting_depth"), + num_threads=omp_options.get_option("num_threads"), + schedule=omp_options.get_option("schedule"), + collapse=omp_options.get_option("collapse"), + omit_parallel=omp_options.get_option("omit_parallel_construct"), ) - add_omp = AddOpenMP(self._ctx, params) kernel_ast = cast(PsBlock, add_omp(kernel_ast)) if self._intermediates is not None: self._intermediates.cpu_openmp = kernel_ast.clone() - if cpu_cfg.use_cacheline_zeroing: - raise NotImplementedError("CL-zeroing not implemented yet") - return kernel_ast def _vectorize(self, kernel_ast: PsBlock) -> PsBlock: - assert self._cfg.cpu_optim is not None - vec_config = self._cfg.cpu_optim.get_vectorization_config() - if vec_config is None: + vec_options = self._cfg.cpu.vectorize + + enable_vec = vec_options.get_option("enable") + + if not enable_vec: return kernel_ast from ..backend.transformations import LoopVectorizer, SelectIntrinsics @@ -328,7 +352,9 @@ class DefaultKernelCreationDriver: inner_loop_dim = ispace.dimensions[inner_loop_coord] # Apply stride (TODO: and alignment) assumptions - if vec_config.assume_inner_stride_one: + assume_unit_stride: bool = vec_options.get_option("assume_inner_stride_one") + + if assume_unit_stride: for field in self._ctx.fields: buf = self._ctx.get_buffer(field) inner_stride = buf.strides[inner_loop_coord] @@ -344,14 +370,14 @@ class DefaultKernelCreationDriver: # TODO: Communicate assumption to runtime system via a precondition # Call loop vectorizer - if vec_config.lanes is None: - lanes = VectorizationConfig.default_lanes( + num_lanes: int | None = vec_options.get_option("lanes") + + if num_lanes is None: + num_lanes = VectorizationOptions.default_lanes( self._target, cast(PsScalarType, self._ctx.default_dtype) ) - else: - lanes = vec_config.lanes - vectorizer = LoopVectorizer(self._ctx, lanes) + 
vectorizer = LoopVectorizer(self._ctx, num_lanes) def loop_predicate(loop: PsLoop): return loop.counter.symbol == inner_loop_dim.counter @@ -397,15 +423,30 @@ class DefaultKernelCreationDriver: ) elif Target._GPU in self._target: + gpu_opts = self._cfg.gpu + omit_range_check: bool = gpu_opts.get_option("omit_range_check") + match self._target: case Target.SYCL: from ..backend.platforms import SyclPlatform - return SyclPlatform(self._ctx, self._cfg.gpu_indexing) + auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") + + return SyclPlatform( + self._ctx, + omit_range_check=omit_range_check, + automatic_block_size=auto_block_size, + ) case Target.CUDA: from ..backend.platforms import CudaPlatform - return CudaPlatform(self._ctx, self._cfg.gpu_indexing) + manual_grid = gpu_opts.get_option("manual_launch_grid") + + return CudaPlatform( + self._ctx, + omit_range_check=omit_range_check, + manual_launch_grid=manual_grid, + ) raise NotImplementedError( f"Code generation for target {self._target} not implemented" diff --git a/src/pystencils/types/types.py b/src/pystencils/types/types.py index 7645a452f..825ac1d5d 100644 --- a/src/pystencils/types/types.py +++ b/src/pystencils/types/types.py @@ -35,7 +35,7 @@ class PsCustomType(PsType): return self._name def c_string(self) -> str: - return f"{self._const_string()} {self._name}" + return f"{self._const_string()}{self._name}" def __repr__(self) -> str: return f"CustomType( {self.name}, const={self.const} )" diff --git a/tests/codegen/test_config.py b/tests/codegen/test_config.py index 715830e70..0f7591b3e 100644 --- a/tests/codegen/test_config.py +++ b/tests/codegen/test_config.py @@ -2,8 +2,17 @@ import pytest from dataclasses import dataclass import numpy as np -from pystencils.codegen.config import BasicOption, Option, Category, ConfigBase, CreateKernelConfig -from pystencils.types.quick import Int, UInt +from pystencils.codegen.config import ( + BasicOption, + Option, + Category, + ConfigBase, + 
CreateKernelConfig, + CpuOptions +) +from pystencils.field import Field, FieldType +from pystencils.types.quick import Int, UInt, Fp, Ptr +from pystencils.types import PsVectorType def test_descriptors(): @@ -11,16 +20,19 @@ def test_descriptors(): @dataclass class SampleCategory(ConfigBase): val1: BasicOption[int] = BasicOption(2) - val2: Option[bool, str] = Option(False) + val2: Option[bool, str | bool] = Option(False) @val2.validate - def validate_val2(self, v: str): - if v.lower() in ("off", "false", "no"): - return False - elif v.lower() in ("on", "true", "yes"): - return True - - raise ValueError() + def validate_val2(self, v: str | bool): + if isinstance(v, str): + if v.lower() in ("off", "false", "no"): + return False + elif v.lower() in ("on", "true", "yes"): + return True + + raise ValueError() + else: + return v @dataclass class SampleConfig(ConfigBase): @@ -28,7 +40,7 @@ def test_descriptors(): val: BasicOption[str] = BasicOption("fallback") cfg = SampleConfig() - + # Check unset and default values assert cfg.val is None assert cfg.get_option("val") == "fallback" @@ -55,13 +67,41 @@ def test_descriptors(): cfg.cat = c assert cfg.cat.val1 == 32 assert cfg.cat.val2 is True - + assert cfg.cat is not c c.val1 = 13 assert cfg.cat.val1 == 32 + # Check that category objects on two config objects are not the same + cfg1 = SampleConfig() + cfg2 = SampleConfig() + + assert cfg1.cat is not cfg2.cat + + +def test_category_init(): + cfg1 = CreateKernelConfig() + cfg2 = CreateKernelConfig() + + assert cfg1.cpu is not cfg2.cpu + assert cfg1.cpu.openmp is not cfg2.cpu.openmp + assert cfg1.cpu.vectorize is not cfg2.cpu.vectorize + assert cfg1.gpu is not cfg2.gpu + + +def test_category_copy(): + cfg = CreateKernelConfig() + cpu_repl = CpuOptions() + cpu_repl.openmp.num_threads = 42 + + cfg.cpu = cpu_repl + assert cfg.cpu.openmp.num_threads == 42 + assert cfg.cpu is not cpu_repl + assert cfg.cpu.openmp is not cpu_repl.openmp + def test_config_validation(): + # Check index 
dtype validation cfg = CreateKernelConfig(index_dtype="int32") assert cfg.index_dtype == Int(32) cfg.index_dtype = np.uint64 @@ -72,3 +112,27 @@ def test_config_validation(): with pytest.raises(ValueError): cfg.index_dtype = "double" + + # Check default dtype validation + cfg = CreateKernelConfig(default_dtype="float32") + assert cfg.default_dtype == Fp(32) + cfg.default_dtype = np.int64 + assert cfg.default_dtype == Int(64) + + with pytest.raises(ValueError): + cfg.default_dtype = PsVectorType(Fp(64), 4) + + with pytest.raises(ValueError): + _ = CreateKernelConfig(default_dtype=Ptr(Fp(32))) + + # Check index field validation + idx_field = Field.create_generic( + "idx", spatial_dimensions=1, field_type=FieldType.INDEXED + ) + cfg.index_field = idx_field + assert cfg.index_field == idx_field + + with pytest.raises(ValueError): + cfg.index_field = Field.create_generic( + "idx", spatial_dimensions=1, field_type=FieldType.GENERIC + ) diff --git a/tests/fixtures.py b/tests/fixtures.py index 8c7f12015..71e54bad8 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -31,15 +31,17 @@ except ImportError: AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets() TARGET_IDS = [t.name for t in AVAILABLE_TARGETS] + @pytest.fixture(params=AVAILABLE_TARGETS, ids=TARGET_IDS) def target(request) -> ps.Target: """Provides all code generation targets available on the current hardware""" return request.param + @pytest.fixture def gen_config(target: ps.Target): """Default codegen configuration for the current target. - + For GPU targets, set default indexing options. For vector-CPU targets, set default vectorization config. 
""" @@ -47,25 +49,24 @@ def gen_config(target: ps.Target): gen_config = ps.CreateKernelConfig(target=target) if target.is_vector_cpu(): - gen_config = replace( - gen_config, - cpu_optim=ps.CpuOptions( - vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) - ), - ) + gen_config.cpu.vectorize.enable = True + gen_config.cpu.vectorize.assume_inner_stride_one = True return gen_config + @pytest.fixture() def xp(target: ps.Target) -> ModuleType: """Primary array module for the current target. - + Returns: `cupy` if `target == Target.CUDA`, and `numpy` otherwise """ if target == ps.Target.CUDA: import cupy as xp + return xp else: import numpy as np + return np diff --git a/tests/kernelcreation/test_buffer_gpu.py b/tests/kernelcreation/test_buffer_gpu.py index 873cc1213..db8cc2596 100644 --- a/tests/kernelcreation/test_buffer_gpu.py +++ b/tests/kernelcreation/test_buffer_gpu.py @@ -300,7 +300,7 @@ def test_iteration_slices(gpu_indexing): gpu_dst_arr.fill(0) config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - gpu_indexing=gpu_indexing) + gpu=gpu_indexing) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -313,7 +313,7 @@ def test_iteration_slices(gpu_indexing): unpack_eqs.append(eq) config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - gpu_indexing=gpu_indexing) + gpu=gpu_indexing) unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 57de84b7a..97f0c0fa9 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -112,7 +112,7 @@ def test_ghost_layer(): update_rule = Assignment(dst_field[0, 0], src_field[0, 0]) ghost_layers = [(1, 2), (2, 1)] - config = CreateKernelConfig(target=Target.GPU, ghost_layers=ghost_layers, gpu_indexing="line") + config = CreateKernelConfig(target=Target.GPU, ghost_layers=ghost_layers, gpu="line") ast 
= create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config) kernel = ast.compile() @@ -135,7 +135,7 @@ def test_setting_value(): f = Field.create_generic("f", 2) update_rule = [Assignment(f(0), sp.Symbol("value"))] - config = CreateKernelConfig(target=Target.GPU, gpu_indexing="line", iteration_slice=iteration_slice) + config = CreateKernelConfig(target=Target.GPU, gpu="line", iteration_slice=iteration_slice) ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config) kernel = ast.compile() @@ -207,7 +207,7 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape): f = Field.create_from_numpy_array("f", arr_cpu) update_rule = [Assignment(f.center, sp.Symbol("value"))] - config = CreateKernelConfig(target=Target.GPU, gpu_indexing=gpu_indexing, iteration_slice=iteration_slice) + config = CreateKernelConfig(target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice) ast = create_kernel(update_rule, config=config) kernel = ast.compile() diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py index 5c7b4d8cb..892d2d949 100644 --- a/tests/kernelcreation/test_iteration_slices.py +++ b/tests/kernelcreation/test_iteration_slices.py @@ -104,6 +104,9 @@ def test_symbolic_slice(gen_config: CreateKernelConfig, xp): update = Assignment(f.center(), 1) islice = make_slice[sy:ey, sx:ex] gen_config = replace(gen_config, iteration_slice=islice) + + print(repr(gen_config)) + kernel = create_kernel(update, gen_config).compile() for slic in [make_slice[:, :], make_slice[1:-1, 2:-2], make_slice[8:14, 7:11]]: @@ -140,9 +143,7 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): gen_config = replace(gen_config, iteration_slice=islice) if gen_config.target == Target.CUDA: - gen_config = replace( - gen_config, gpu_indexing=GpuOptions(manual_launch_grid=True) - ) + gen_config.gpu.manual_launch_grid = True kernel = create_kernel(update, gen_config).compile() @@ -170,12 +171,10 @@ 
def test_red_black_pattern(gen_config: CreateKernelConfig, xp): outer_counter = DEFAULTS.spatial_counters[0] start = sp.Piecewise((0, sp.Eq(int_rem(outer_counter, 2), 0)), (1, True)) islice = make_slice[:, start::2] - gen_config = replace(gen_config, iteration_slice=islice) + gen_config.iteration_slice = islice if gen_config.target == Target.CUDA: - gen_config = replace( - gen_config, gpu_indexing=GpuOptions(manual_launch_grid=True) - ) + gen_config.gpu.manual_launch_grid = True try: kernel = create_kernel(update, gen_config).compile() diff --git a/tests/nbackend/kernelcreation/test_openmp.py b/tests/nbackend/kernelcreation/test_openmp.py index ae775ca20..07a2f1026 100644 --- a/tests/nbackend/kernelcreation/test_openmp.py +++ b/tests/nbackend/kernelcreation/test_openmp.py @@ -15,20 +15,21 @@ from pystencils.backend.ast.structural import PsLoop, PsPragma @pytest.mark.parametrize("nesting_depth", range(3)) @pytest.mark.parametrize("schedule", ["static", "static,16", "dynamic", "auto"]) -@pytest.mark.parametrize("collapse", range(3)) +@pytest.mark.parametrize("collapse", [None, 1, 2]) @pytest.mark.parametrize("omit_parallel_construct", range(3)) def test_openmp(nesting_depth, schedule, collapse, omit_parallel_construct): f, g = fields("f, g: [3D]") asm = Assignment(f.center(0), g.center(0)) omp = OpenMpOptions( + enable=True, nesting_depth=nesting_depth, schedule=schedule, collapse=collapse, omit_parallel_construct=omit_parallel_construct, ) gen_config = CreateKernelConfig( - target=Target.CPU, cpu_optim=CpuOptions(openmp=omp) + target=Target.CPU, cpu=CpuOptions(openmp=omp) ) kernel = create_kernel(asm, gen_config) @@ -55,7 +56,7 @@ def test_openmp(nesting_depth, schedule, collapse, omit_parallel_construct): expected_tokens = {"omp", "for", f"schedule({omp.schedule})"} if not omp.omit_parallel_construct: expected_tokens.add("parallel") - if omp.collapse > 0: + if omp.collapse is not None: expected_tokens.add(f"collapse({omp.collapse})") assert tokens == 
expected_tokens diff --git a/tests/nbackend/kernelcreation/test_options.py b/tests/nbackend/kernelcreation/test_options.py deleted file mode 100644 index fefcc98fe..000000000 --- a/tests/nbackend/kernelcreation/test_options.py +++ /dev/null @@ -1,28 +0,0 @@ -import pytest - -from pystencils.field import Field, FieldType -from pystencils.types.quick import * -from pystencils.codegen.config import ( - CreateKernelConfig, - PsOptionsError, -) - - -def test_invalid_iteration_region_options(): - idx_field = Field.create_generic( - "idx", spatial_dimensions=1, field_type=FieldType.INDEXED - ) - with pytest.raises(PsOptionsError): - CreateKernelConfig( - ghost_layers=2, iteration_slice=(slice(1, -1), slice(1, -1)) - ) - with pytest.raises(PsOptionsError): - CreateKernelConfig(ghost_layers=2, index_field=idx_field) - - -def test_index_field_options(): - with pytest.raises(PsOptionsError): - idx_field = Field.create_generic( - "idx", spatial_dimensions=1, field_type=FieldType.GENERIC - ) - CreateKernelConfig(index_field=idx_field) diff --git a/tests/nbackend/transformations/test_add_pragmas.py b/tests/nbackend/transformations/test_add_pragmas.py index 1d8dd1ded..c1749fe28 100644 --- a/tests/nbackend/transformations/test_add_pragmas.py +++ b/tests/nbackend/transformations/test_add_pragmas.py @@ -12,6 +12,7 @@ from pystencils.backend.ast import dfs_preorder from pystencils.backend.ast.structural import PsBlock, PsPragma, PsLoop from pystencils.backend.transformations import InsertPragmasAtLoops, LoopPragma + def test_insert_pragmas(): ctx = KernelCreationContext() factory = AstFactory(ctx) diff --git a/tests/test_quicktests.py b/tests/test_quicktests.py index 3e7f4f071..d27a5e61b 100644 --- a/tests/test_quicktests.py +++ b/tests/test_quicktests.py @@ -74,8 +74,8 @@ def test_basic_vectorization(): ast = ps.create_kernel( update_rule, target=target, - cpu_optim=ps.CpuOptions( - vectorize=ps.VectorizationConfig(assume_inner_stride_one=True) + cpu=ps.CpuOptions( + 
vectorize=ps.VectorizationOptions(enable=True, assume_inner_stride_one=True) ), ) -- GitLab From 315cc8c0cdafb0b2f91a559c680f246cfb8b62e7 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 20 Jan 2025 15:02:11 +0100 Subject: [PATCH 4/7] Update documentation pages. - Update docs on config system - Strictly separate API reference from user guides - Rename "Reference Guides" to "User Manual" - Clean up API docs - Fix various docstring errors --- docs/Makefile | 2 +- docs/source/api/codegen.rst | 38 ++++++++--- docs/source/api/jit.rst | 4 +- docs/source/api/symbolic/assignments.md | 16 +++++ docs/source/api/{ => symbolic}/field.rst | 4 +- docs/source/api/symbolic/index.md | 9 +++ .../api/{ => symbolic}/sympyextensions.rst | 4 +- docs/source/{reference => api}/types.rst | 0 docs/source/contributing/index.md | 2 +- docs/source/index.rst | 15 ++--- docs/source/{migration.rst => migration.md} | 67 +++++++++++++------ .../{reference => user_manual}/gpu_kernels.md | 9 +-- .../kernelcreation.md | 9 +-- .../symbolic_language.rst | 7 +- src/pystencils/__init__.py | 8 --- .../backend/transformations/add_pragmas.py | 3 +- src/pystencils/codegen/__init__.py | 8 --- src/pystencils/codegen/config.py | 44 ++++++------ 18 files changed, 144 insertions(+), 105 deletions(-) create mode 100644 docs/source/api/symbolic/assignments.md rename docs/source/api/{ => symbolic}/field.rst (97%) create mode 100644 docs/source/api/symbolic/index.md rename docs/source/api/{ => symbolic}/sympyextensions.rst (97%) rename docs/source/{reference => api}/types.rst (100%) rename docs/source/{migration.rst => migration.md} (54%) rename docs/source/{reference => user_manual}/gpu_kernels.md (97%) rename docs/source/{reference => user_manual}/kernelcreation.md (99%) rename docs/source/{reference => user_manual}/symbolic_language.rst (96%) diff --git a/docs/Makefile b/docs/Makefile index a293f14ee..0cfe1ab8b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -22,7 +22,7 @@ html: 
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) clean: - rm -rf source/reference/generated rm -rf source/api/generated + rm -rf source/api/symbolic/generated rm -rf source/backend/generated @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/source/api/codegen.rst b/docs/source/api/codegen.rst index d65e9a358..1fb83fe5f 100644 --- a/docs/source/api/codegen.rst +++ b/docs/source/api/codegen.rst @@ -1,5 +1,5 @@ -pystencils.codegen -================== +Code Generation +=============== .. module:: pystencils.codegen @@ -15,16 +15,19 @@ Invocation Configuration ------------- +.. module:: pystencils.codegen.config + .. autosummary:: :toctree: generated :nosignatures: - :template: autosummary/entire_class.rst + :template: autosummary/recursive_class.rst CreateKernelConfig - CpuOptimConfig - OpenMpConfig - VectorizationConfig - GpuIndexingConfig + CpuOptions + OpenMpOptions + VectorizationOptions + GpuOptions + SyclOptions .. autosummary:: :toctree: generated @@ -32,9 +35,24 @@ Configuration AUTO +.. dropdown:: Configuration System Implementation Details + + .. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + Option + BasicOption + Category + ConfigBase + + Target Specification -------------------- +.. module:: pystencils.codegen.target + .. autosummary:: :toctree: generated :nosignatures: @@ -45,12 +63,14 @@ Target Specification Code Generation Drivers ----------------------- +.. module:: pystencils.codegen.driver + .. autosummary:: :toctree: generated :nosignatures: :template: autosummary/entire_class.rst - driver.DefaultKernelCreationDriver + DefaultKernelCreationDriver .. autosummary:: :toctree: generated @@ -61,6 +81,8 @@ Code Generation Drivers Output Code Objects ------------------- +.. currentmodule:: pystencils.codegen + .. 
autosummary:: :toctree: generated :nosignatures: diff --git a/docs/source/api/jit.rst b/docs/source/api/jit.rst index 7bcd9989c..f2e271db3 100644 --- a/docs/source/api/jit.rst +++ b/docs/source/api/jit.rst @@ -1,5 +1,5 @@ -pystencils.jit -============== +JIT Compilation +=============== .. module:: pystencils.jit diff --git a/docs/source/api/symbolic/assignments.md b/docs/source/api/symbolic/assignments.md new file mode 100644 index 000000000..69446a8a5 --- /dev/null +++ b/docs/source/api/symbolic/assignments.md @@ -0,0 +1,16 @@ +# Assignments and AssignmentCollection + +```{eval-rst} + +.. py:class:: pystencils.Assignment + + Monkeypatched variant of `sympy.codegen.ast.Assignment`. + Represents an assignment of an expression to a symbol. + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/recursive_class.rst + + pystencils.AssignmentCollection +``` diff --git a/docs/source/api/field.rst b/docs/source/api/symbolic/field.rst similarity index 97% rename from docs/source/api/field.rst rename to docs/source/api/symbolic/field.rst index 79cc12a3a..33219c059 100644 --- a/docs/source/api/field.rst +++ b/docs/source/api/symbolic/field.rst @@ -1,5 +1,5 @@ -pystencils.field -================ +Fields +====== .. 
module:: pystencils.field diff --git a/docs/source/api/symbolic/index.md b/docs/source/api/symbolic/index.md new file mode 100644 index 000000000..fad3df20b --- /dev/null +++ b/docs/source/api/symbolic/index.md @@ -0,0 +1,9 @@ +# Symbolic Toolbox + +:::{toctree} +:maxdepth: 1 + +field +assignments +sympyextensions +::: diff --git a/docs/source/api/sympyextensions.rst b/docs/source/api/symbolic/sympyextensions.rst similarity index 97% rename from docs/source/api/sympyextensions.rst rename to docs/source/api/symbolic/sympyextensions.rst index d377f998e..e3d10fbdf 100644 --- a/docs/source/api/sympyextensions.rst +++ b/docs/source/api/symbolic/sympyextensions.rst @@ -1,5 +1,5 @@ -pystencils.sympyextensions -========================== +Extensions to SymPy +=================== .. module:: pystencils.sympyextensions diff --git a/docs/source/reference/types.rst b/docs/source/api/types.rst similarity index 100% rename from docs/source/reference/types.rst rename to docs/source/api/types.rst diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md index 39e68b06f..04ad821ce 100644 --- a/docs/source/contributing/index.md +++ b/docs/source/contributing/index.md @@ -1,4 +1,4 @@ -# Contributor Guide +# Contribution Guide Welcome to the Contributor's Guide to pystencils! If you are interested in contributing to the development of pystencils, this is the place to start. diff --git a/docs/source/index.rst b/docs/source/index.rst index 5ddec09f2..cb455c8b4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -77,19 +77,18 @@ Topics .. toctree:: :maxdepth: 1 - :caption: Reference Guides + :caption: User Manual - reference/symbolic_language - reference/kernelcreation - reference/gpu_kernels - reference/types + user_manual/symbolic_language + user_manual/kernelcreation + user_manual/gpu_kernels .. 
toctree:: :maxdepth: 1 - :caption: API + :caption: API Reference - api/field - api/sympyextensions + api/symbolic/index + api/types api/codegen api/jit diff --git a/docs/source/migration.rst b/docs/source/migration.md similarity index 54% rename from docs/source/migration.rst rename to docs/source/migration.md index ea59d8881..c3cb17d0f 100644 --- a/docs/source/migration.rst +++ b/docs/source/migration.md @@ -1,36 +1,62 @@ -.. _page_v2_migration: +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +mystnb: + execution_mode: cache +--- -*************************** -Version 2.0 Migration Guide -*************************** +(_page_v2_migration)= +# Version 2.0 Migration Guide With version 2.0, many APIs of *pystencils* will be changed; old interfaces are being deprecated and new systems are put in place. This page is a still-incomplete list of these changes, with advice on how to migrate your code from pystencils 1.x to pystencils 2.0. -Kernel Creation -=============== +```{code-cell} ipython3 +:tags: [remove-cell] -Configuration -------------- +import pystencils as ps +``` -The API of `create_kernel`, and the configuration options of the `CreateKernelConfig`, have changed significantly: + +## Kernel Creation + +### Configuration + +The API of {any}`create_kernel`, and the configuration options of the {any}`CreateKernelConfig`, have changed significantly. +The `CreateKernelConfig` class has been refined to be safe to copy and edit incrementally. 
+The recommended way of setting up the code generator is now *incremental configuration*: + +```{code-cell} ipython3 +cfg = ps.CreateKernelConfig() +cfg.default_dtype = "float32" +cfg.cpu.openmp.enable = True +cfg.cpu.openmp.num_threads = 8 +cfg.ghost_layers = 2 +``` - *Data Types:* `CreateKernelConfig` now takes to parameters to control data types in your kernels: the ``default_dtype`` is applied to all numerical computations, while the ``index_dtype`` is used for all index calculations and loop counters. +- *CPU Optimization Options:* Should now be set via the {any}`cpu <CpuOptions>` option category and its subcategories. .. dropdown:: Deprecated options of `CreateKernelConfig` - ``data_type``: Use ``default_dtype`` instead - - ``cpu_openmp``: Set OpenMP-Options via an `OpenMpConfig` in the ``cpu_optim`` (`CpuOptimConfig`) instead. - - ``cpu_vectorize_info``: Set vectorization options via a `VectorizationConfig` in the ``cpu_optim`` option instead - - ``gpu_indexing_params``: Set GPU indexing options via a `GpuIndexingConfig` in the ``gpu_indexing`` option instead + - ``cpu_openmp``: Set OpenMP-Options in the `cpu.openmp <OpenMpOptions>` category instead. + - ``cpu_vectorize_info``: Set vectorization options in the `cpu.vectorize <VectorizationOptions>` category instead + - ``gpu_indexing_params``: Set GPU indexing options in the `gpu <GpuOptions>` category instead -Type Checking -------------- +### Type Checking The old type checking system of pystencils' code generator has been replaced by a new type inference and validation mechanism whose rules are much stricter than before. @@ -38,24 +64,23 @@ While running `create_kernel`, you may now encounter a `TypificationError` where If this happens, it is probable that you have been doing some illegal, maybe dangerous, or at least unsafe things with data types (like inserting integers into a floating-point context without casting them, or mixing types of different precisions or signedness). 
If you are sure the error is not your fault, please file an issue at our -`bug tracker <https://i10git.cs.fau.de/pycodegen/pystencils/-/issues>`_. +[bug tracker](https://i10git.cs.fau.de/pycodegen/pystencils/-/issues). -Type System -=========== +### Type System -The ``pystencils.typing`` module has been entirely replaced by the new `pystencils.types` module, +The ``pystencils.typing`` module has been entirely replaced by the new {any}`pystencils.types` module, which is home to a completely new type system. -The primary interaction points with this system are still the `TypedSymbol` class and the `create_type` routine. +The primary interaction points with this system are still the {any}`TypedSymbol` class and the {any}`create_type` routine. Code using any of these two should not require any changes, except: - *Importing `TypedSymbol` and `create_type`:* Both `TypedSymbol` and `create_type` should now be imported directly from the ``pystencils`` namespace. - *Custom data types:* `TypedSymbol` used to accept arbitrary strings as data types. - This is no longer possible; instead, import `pystencils.types.PsCustomType` and use it to describe + This is no longer possible; instead, import {any}`pystencils.types.PsCustomType` and use it to describe custom data types unknown to pystencils, as in ``TypedSymbol("xs", PsCustomType("std::vector< int >"))`` All old data type classes (such as ``BasicType``, ``PointerType``, ``StructType``, etc.) have been removed -and replaced by the class hierarchy below `PsType`. +and replaced by the class hierarchy below {any}`PsType`. Directly using any of these type classes in the frontend is discouraged unless absolutely necessary; in most cases, `create_type` suffices. 
diff --git a/docs/source/reference/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md similarity index 97% rename from docs/source/reference/gpu_kernels.md rename to docs/source/user_manual/gpu_kernels.md index 786840d18..4db2d7944 100644 --- a/docs/source/reference/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -159,15 +159,10 @@ kernel = ps.create_kernel(assignments, cfg).compile() ``` This warns us that the threads range could not be determined automatically. -We can disable this warning by setting `manual_launch_grid` in the GPU indexing options: +We can disable this warning by setting `manual_launch_grid` in the GPU option category: ```{code-cell} -cfg = ps.CreateKernelConfig( - # ... other options ... - gpu_indexing=ps.GpuIndexingConfig( - manual_launch_grid=True - ) -) +cfg.gpu.manual_launch_grid = True ``` Now, to execute our kernel, we have to manually specify its launch grid: diff --git a/docs/source/reference/kernelcreation.md b/docs/source/user_manual/kernelcreation.md similarity index 99% rename from docs/source/reference/kernelcreation.md rename to docs/source/user_manual/kernelcreation.md index 248855fc1..c85c8f99d 100644 --- a/docs/source/reference/kernelcreation.md +++ b/docs/source/user_manual/kernelcreation.md @@ -485,13 +485,10 @@ h = sp.Symbol("h") cfg = ps.CreateKernelConfig( target=ps.Target.X86_AVX512, default_dtype="float32", - cpu_optim=ps.CpuOptimConfig( - openmp=True, - vectorize=ps.VectorizationConfig( - assume_inner_stride_one=True - ) - ) ) +cfg.cpu.openmp.enable = True +cfg.cpu.vectorize.enable = True +cfg.cpu.vectorize.assume_inner_stride_one = True assignments = [ ps.Assignment( diff --git a/docs/source/reference/symbolic_language.rst b/docs/source/user_manual/symbolic_language.rst similarity index 96% rename from docs/source/reference/symbolic_language.rst rename to docs/source/user_manual/symbolic_language.rst index 63b94e04d..6d219306e 100644 --- a/docs/source/reference/symbolic_language.rst +++ 
b/docs/source/user_manual/symbolic_language.rst @@ -42,10 +42,6 @@ Assignments are the fundamental components of pystencils kernels; they are used both for assigning expressions to symbols and for writing values to fields. -.. py:class:: pystencils.Assignment - - Slightly monkey-patched version of `sympy.codegen.ast.Assignment`. - Assignments are combined and structured inside `assignment collections <pystencils.AssignmentCollection>`. An assignment collection contains two separate lists of assignments: @@ -56,10 +52,9 @@ An assignment collection contains two separate lists of assignments: into fields. .. autosummary:: - :toctree: generated :nosignatures: - :template: autosummary/recursive_class.rst + pystencils.Assignment pystencils.AssignmentCollection diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 8c59f7846..a23ce185d 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -3,10 +3,6 @@ from .codegen import ( Target, CreateKernelConfig, - CpuOptions, - VectorizationOptions, - OpenMpOptions, - GpuOptions, AUTO ) from .defaults import DEFAULTS @@ -50,10 +46,6 @@ __all__ = [ "create_numeric_type", "make_slice", "CreateKernelConfig", - "CpuOptions", - "VectorizationOptions", - "GpuOptions", - "OpenMpOptions", "AUTO", "create_kernel", "create_staggered_kernel", diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index f44b89c72..0e6d314ac 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -98,8 +98,7 @@ class InsertPragmasAtLoops: class AddOpenMP: """Apply OpenMP directives to loop nests. - This transformation augments the AST with OpenMP pragmas according to the given - `OpenMpConfig` configuration. + This transformation augments the AST with OpenMP pragmas according to the given configuration. 
""" def __init__( diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py index 3780527c6..e13f911dd 100644 --- a/src/pystencils/codegen/__init__.py +++ b/src/pystencils/codegen/__init__.py @@ -1,10 +1,6 @@ from .target import Target from .config import ( CreateKernelConfig, - CpuOptions, - VectorizationOptions, - OpenMpOptions, - GpuOptions, AUTO, ) from .parameters import Parameter @@ -14,10 +10,6 @@ from .driver import create_kernel, get_driver __all__ = [ "Target", "CreateKernelConfig", - "CpuOptions", - "VectorizationOptions", - "OpenMpOptions", - "GpuOptions", "AUTO", "Parameter", "Kernel", diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 9abf51222..cb457f673 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -25,7 +25,11 @@ if TYPE_CHECKING: Option_T = TypeVar("Option_T") +"""Type variable for option values""" + + Arg_T = TypeVar("Arg_T") +"""Type variable for option arguments""" class Option(Generic[Option_T, Arg_T]): @@ -35,19 +39,19 @@ class Option(Generic[Option_T, Arg_T]): It maintains a default value for the option that is used when no value was specified by the user. - In configuration options, the value `None` stands for `unset`. + In configuration options, the value `None` stands for ``unset``. It can therefore not be used to set an option to the meaning "not any", or "empty" - for these, special values need to be used. The Option allows a validator function to be specified, which will be called to perform sanity checks on user-provided values. - Through the validator, options may also be set from arguments of a different type (`Arg_T`) - than their value type (`Option_T`). If `Arg_T` is different from `Option_T`, + Through the validator, options may also be set from arguments of a different type (``Arg_T``) + than their value type (``Option_T``). 
If ``Arg_T`` is different from ``Option_T``, the validator must perform the conversion from the former to the latter. .. note:: - `Arg_T` must always be a supertype of `Option_T`. + ``Arg_T`` must always be a supertype of ``Option_T``. """ def __init__( @@ -99,7 +103,8 @@ class Option(Generic[Option_T, Arg_T]): delattr(obj, self._lookup) -class BasicOption(Option[Option_T, Option_T]): ... # noqa: E701 +class BasicOption(Option[Option_T, Option_T]): + "Subclass of Option where ``Arg_T == Option_T``." class ConfigBase(ABC): @@ -171,6 +176,7 @@ class ConfigBase(ABC): Category_T = TypeVar("Category_T", bound=ConfigBase) +"""Type variable for option categories.""" class Category(Generic[Category_T]): @@ -212,7 +218,7 @@ Currently, these options permit `AUTO`: @dataclass class OpenMpOptions(ConfigBase): - """Parameters controlling kernel parallelization using OpenMP.""" + """Configuration options controlling automatic OpenMP instrumentation.""" enable: BasicOption[bool] = BasicOption(False) """Enable OpenMP instrumentation""" @@ -238,11 +244,7 @@ class OpenMpOptions(ConfigBase): @dataclass class VectorizationOptions(ConfigBase): - """Configuration for the auto-vectorizer. - - If any flag in this configuration is set to a value not supported by the CPU specified - in `CreateKernelConfig.target`, an error will be raised. - """ + """Configuration for the auto-vectorizer.""" enable: BasicOption[bool] = BasicOption(False) """Enable intrinsic vectorization.""" @@ -305,11 +307,7 @@ class VectorizationOptions(ConfigBase): @dataclass class CpuOptions(ConfigBase): - """Configuration for the CPU optimizer. - - If any flag in this configuration is set to a value not supported by the CPU specified - in `CreateKernelConfig.target`, an error will be raised. - """ + """Configuration options specific to CPU targets.""" openmp: Category[OpenMpOptions] = Category(OpenMpOptions()) """Options governing OpenMP-instrumentation. 
@@ -335,7 +333,7 @@ class CpuOptions(ConfigBase): @dataclass class GpuOptions(ConfigBase): - """Configure index translation behaviour for kernels generated for GPU targets.""" + """Configuration options specific to GPU targets.""" omit_range_check: BasicOption[bool] = BasicOption(False) """If set to `True`, omit the iteration counter range check. @@ -467,13 +465,13 @@ class CreateKernelConfig(ConfigBase): """Target-Specific Options""" cpu: Category[CpuOptions] = Category(CpuOptions()) - """Options for CPU kernels.""" + """Options for CPU kernels. See `CpuOptions`.""" gpu: Category[GpuOptions] = Category(GpuOptions()) - """Options for GPU Kernels.""" + """Options for GPU Kernels. See `GpuOptions`.""" sycl: Category[SyclOptions] = Category(SyclOptions()) - """Options for SYCL kernels.""" + """Options for SYCL kernels. See `SyclOptions`.""" @index_dtype.validate def validate_index_type(self, spec: UserTypeSpec): @@ -503,13 +501,13 @@ class CreateKernelConfig(ConfigBase): """Deprecated; use `default_dtype` instead""" cpu_openmp: InitVar[bool | int | None] = None - """Deprecated; use `cpu_optim.openmp <CpuOptimConfig.openmp>` instead.""" + """Deprecated; use `cpu.openmp <CpuOptions.openmp>` instead.""" cpu_vectorize_info: InitVar[dict | None] = None - """Deprecated; use `cpu_optim.vectorize <CpuOptimConfig.vectorize>` instead.""" + """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead.""" gpu_indexing_params: InitVar[dict | None] = None - """Deprecated; use `gpu_indexing` instead.""" + """Deprecated; set options in the `gpu` category instead.""" # Getters -- GitLab From 958a499f4a529c4ca1636e306fdecc327da13759 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 20 Jan 2025 15:27:51 +0100 Subject: [PATCH 5/7] fix testsuite --- .../01_tutorial_getting_started.ipynb | 10 ++++--- tests/kernelcreation/test_buffer_gpu.py | 6 ++-- tests/kernelcreation/test_iteration_slices.py | 5 ++-- tests/nbackend/kernelcreation/test_openmp.py | 26 
+++++++----------- .../test_data/datahandling_save_test.npz | Bin 428 -> 410 bytes tests/test_quicktests.py | 11 +++----- 6 files changed, 24 insertions(+), 34 deletions(-) diff --git a/docs/source/tutorials/01_tutorial_getting_started.ipynb b/docs/source/tutorials/01_tutorial_getting_started.ipynb index 04dc50e51..f6c92a6bb 100644 --- a/docs/source/tutorials/01_tutorial_getting_started.ipynb +++ b/docs/source/tutorials/01_tutorial_getting_started.ipynb @@ -1138,7 +1138,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1262,11 +1262,13 @@ } ], "source": [ + "cfg = ps.CreateKernelConfig()\n", + "cfg.cpu.openmp.enable = True\n", + "cfg.cpu.openmp.num_threads = 2\n", "ast = ps.create_kernel(\n", " update_rule,\n", - " cpu = ps.CpuOptions(\n", - " openmp=ps.OpenMpOptions(enable=True, num_threads=2))\n", - " )\n", + " cfg\n", + ")\n", "\n", "ps.show_code(ast)" ] diff --git a/tests/kernelcreation/test_buffer_gpu.py b/tests/kernelcreation/test_buffer_gpu.py index db8cc2596..0b5019fba 100644 --- a/tests/kernelcreation/test_buffer_gpu.py +++ b/tests/kernelcreation/test_buffer_gpu.py @@ -299,8 +299,7 @@ def test_iteration_slices(gpu_indexing): gpu_src_arr.set(src_arr) gpu_dst_arr.fill(0) - config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - gpu=gpu_indexing) + config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -312,8 +311,7 @@ def test_iteration_slices(gpu_indexing): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, - gpu=gpu_indexing) + config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice) unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() diff --git a/tests/kernelcreation/test_iteration_slices.py 
b/tests/kernelcreation/test_iteration_slices.py index 892d2d949..02b6b9922 100644 --- a/tests/kernelcreation/test_iteration_slices.py +++ b/tests/kernelcreation/test_iteration_slices.py @@ -13,7 +13,6 @@ from pystencils import ( make_slice, Target, CreateKernelConfig, - GpuOptions, DynamicType, ) from pystencils.sympyextensions.integer_functions import int_rem @@ -81,7 +80,7 @@ def test_numerical_slices(gen_config: CreateKernelConfig, xp, islice): try: kernel = create_kernel(update, gen_config).compile() except NotImplementedError: - if gen_config.target.is_vector_cpu(): + if gen_config.get_target().is_vector_cpu(): # TODO Gather/Scatter not implemented yet pytest.xfail("Gather/Scatter not available yet") @@ -179,7 +178,7 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp): try: kernel = create_kernel(update, gen_config).compile() except NotImplementedError: - if gen_config.target.is_vector_cpu(): + if gen_config.get_target().is_vector_cpu(): pytest.xfail("Gather/Scatter not implemented yet") if isinstance(kernel, CupyKernelWrapper): diff --git a/tests/nbackend/kernelcreation/test_openmp.py b/tests/nbackend/kernelcreation/test_openmp.py index 07a2f1026..4e24cd1b2 100644 --- a/tests/nbackend/kernelcreation/test_openmp.py +++ b/tests/nbackend/kernelcreation/test_openmp.py @@ -4,8 +4,6 @@ from pystencils import ( Assignment, create_kernel, CreateKernelConfig, - CpuOptions, - OpenMpOptions, Target, ) @@ -21,16 +19,12 @@ def test_openmp(nesting_depth, schedule, collapse, omit_parallel_construct): f, g = fields("f, g: [3D]") asm = Assignment(f.center(0), g.center(0)) - omp = OpenMpOptions( - enable=True, - nesting_depth=nesting_depth, - schedule=schedule, - collapse=collapse, - omit_parallel_construct=omit_parallel_construct, - ) - gen_config = CreateKernelConfig( - target=Target.CPU, cpu=CpuOptions(openmp=omp) - ) + gen_config = CreateKernelConfig(target=Target.CPU) + gen_config.cpu.openmp.enable = True + gen_config.cpu.openmp.nesting_depth = nesting_depth 
+ gen_config.cpu.openmp.schedule = schedule + gen_config.cpu.openmp.collapse = collapse + gen_config.cpu.openmp.omit_parallel_construct = omit_parallel_construct kernel = create_kernel(asm, gen_config) ast = kernel.body @@ -53,10 +47,10 @@ def test_openmp(nesting_depth, schedule, collapse, omit_parallel_construct): pragma = find_omp_pragma(ast) tokens = set(pragma.text.split()) - expected_tokens = {"omp", "for", f"schedule({omp.schedule})"} - if not omp.omit_parallel_construct: + expected_tokens = {"omp", "for", f"schedule({schedule})"} + if not omit_parallel_construct: expected_tokens.add("parallel") - if omp.collapse is not None: - expected_tokens.add(f"collapse({omp.collapse})") + if collapse is not None: + expected_tokens.add(f"collapse({collapse})") assert tokens == expected_tokens diff --git a/tests/runtime/test_data/datahandling_save_test.npz b/tests/runtime/test_data/datahandling_save_test.npz index 22202358a4fa1d1cea4db89c0889f5bca636598b..d363a8a0aba1bb78a06314a19b887eb4c4975334 100644 GIT binary patch literal 410 zcmWIWW@Zs#U|`??Vnv4TVm_%5Ad7*Ofq|VtgrT@7Sud}kl953GECiAPO9ScIZ^U0o z3!FR=a4cZ$yh%}WVwU7BU6409ZQ;7b3+7FW4+)wwLwtVxlu2Ad{F++6tX$&hDq>5R zc1o#PaXF-{T)8-4wS(G&B!*`GZ;QWZ*n0I}`m&5M0Iy?Gic9G07)B-$W?W$d3JM5l iU<A?7kP7f7R#Puf6Vyim-mGjOGnjxd3rI_WO#}cXWnnh} literal 428 zcmWIWW@Zs#U|`??Vnv4TVm_%@Ad7*Ofq|VtgrT@7Sud}kl953GECiAPO9ScIZ^U0o z3!FR=a4cZ$yh%}WVwU7BU6409ZQ;7b3+7FW4+)wwLwtVxlu2Ad{F++6R|5m|&w1#0 zgl-SIU~n-Ih&Eig61XB%LAb!M;UN3(z$z0Nx621J>=G6f+c7W%;B{3>amfI=ijhf# q8CPh50tW&b7(p~N>;k-r)ie#L3F@@~Z&o&t8B9Q!1*CPrCISFllx#cz diff --git a/tests/test_quicktests.py b/tests/test_quicktests.py index d27a5e61b..9cefc84c5 100644 --- a/tests/test_quicktests.py +++ b/tests/test_quicktests.py @@ -71,13 +71,10 @@ def test_basic_vectorization(): update_rule = [ ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0) ] - ast = ps.create_kernel( - update_rule, - target=target, - cpu=ps.CpuOptions( - vectorize=ps.VectorizationOptions(enable=True, 
assume_inner_stride_one=True) - ), - ) + cfg = ps.CreateKernelConfig(target=target) + cfg.cpu.vectorize.enable = True + cfg.cpu.vectorize.assume_inner_stride_one = True + ast = ps.create_kernel(update_rule, cfg) func = ast.compile() -- GitLab From 3e1cf7b09fd5b409027b01cb08d19072d329fd5c Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 20 Jan 2025 15:36:32 +0100 Subject: [PATCH 6/7] add test_override --- tests/codegen/test_config.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/codegen/test_config.py b/tests/codegen/test_config.py index 0f7591b3e..f7f29b760 100644 --- a/tests/codegen/test_config.py +++ b/tests/codegen/test_config.py @@ -136,3 +136,26 @@ def test_config_validation(): cfg.index_field = Field.create_generic( "idx", spatial_dimensions=1, field_type=FieldType.GENERIC ) + + +def test_override(): + cfg1 = CreateKernelConfig() + cfg1.function_name = "test" + cfg1.cpu.openmp.schedule = "dynamic" + cfg1.gpu.manual_launch_grid = False + cfg1.allow_double_writes = True + + cfg2 = CreateKernelConfig() + cfg2.function_name = "func" + cfg2.cpu.openmp.schedule = "static(5)" + cfg2.cpu.vectorize.lanes = 12 + cfg2.allow_double_writes = False + + cfg1.override(cfg2) + + assert cfg1.function_name == "func" + assert cfg1.cpu.openmp.schedule == "static(5)" + assert cfg1.cpu.openmp.enable is None + assert cfg1.cpu.vectorize.lanes == 12 + assert cfg1.cpu.vectorize.assume_aligned is None + assert cfg1.allow_double_writes is False -- GitLab From 9c97cccb51c5596922cd027f0c9fc95bbf65e3a7 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 22 Jan 2025 13:47:09 +0100 Subject: [PATCH 7/7] some minor code + docs cleanup --- docs/source/api/codegen.md | 180 ++++++++++++++++++ docs/source/api/codegen.rst | 94 --------- docs/source/migration.md | 12 +- .../backend/kernelcreation/iteration_space.py | 6 +- src/pystencils/codegen/config.py | 19 +- .../test_data/datahandling_save_test.npz 
| Bin 410 -> 420 bytes 6 files changed, 195 insertions(+), 116 deletions(-) create mode 100644 docs/source/api/codegen.md delete mode 100644 docs/source/api/codegen.rst diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md new file mode 100644 index 000000000..b739a4f33 --- /dev/null +++ b/docs/source/api/codegen.md @@ -0,0 +1,180 @@ +# Code Generation + +## Invocation + +```{eval-rst} +.. module:: pystencils.codegen + +.. autosummary:: + :toctree: generated + :nosignatures: + + create_kernel +``` + +## Configuration + +```{eval-rst} +.. module:: pystencils.codegen.config +``` + +The code generation driver (`create_kernel`, but also `DefaultKernelCreationDriver`) can be configured by +passing it a `CreateKernelConfig` object. +This object can be constructed incrementally: + +```Python +cfg = ps.CreateKernelConfig() +cfg.default_dtype = "float32" +cfg.target = ps.Target.X86_AVX +cfg.cpu.openmp.enable = True +cfg.cpu.vectorize.enable = True +cfg.cpu.vectorize.assume_inner_stride_one = True +``` + +### Options and Option Categories + +The following options and option categories are exposed by the configuration object: + +#### Target Specification + +```{eval-rst} +.. currentmodule:: pystencils.codegen.config + +.. autosummary:: + + ~CreateKernelConfig.target +``` + +#### Data Types + +```{eval-rst} +.. autosummary:: + + ~CreateKernelConfig.default_dtype + ~CreateKernelConfig.index_dtype +``` + +#### Iteration Space + +```{eval-rst} +.. autosummary:: + + ~CreateKernelConfig.ghost_layers + ~CreateKernelConfig.iteration_slice + ~CreateKernelConfig.index_field +``` + +#### Kernel Constraint Checks + +```{eval-rst} +..
autosummary:: + + ~CreateKernelConfig.allow_double_writes + ~CreateKernelConfig.skip_independence_check +``` + +#### Target-Specific Options + +The following categories with target-specific options are exposed: + +| | | +|---------------------------|--------------------------| +| {any}`cpu <CpuOptions>` | Options for CPU kernels | +| {any}`gpu <GpuOptions>` | Options for GPU kernels | +| {any}`sycl <SyclOptions>` | Options for SYCL kernels | + + +#### Kernel Object and Just-In-Time Compilation + +```{eval-rst} +.. autosummary:: + + ~CreateKernelConfig.function_name + ~CreateKernelConfig.jit +``` + +### Configuration System Classes + +```{eval-rst} + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/recursive_class.rst + + CreateKernelConfig + CpuOptions + OpenMpOptions + VectorizationOptions + GpuOptions + SyclOptions + +.. autosummary:: + :toctree: generated + :nosignatures: + + AUTO + +.. dropdown:: Implementation Details + + .. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + Option + BasicOption + Category + ConfigBase + +``` + +## Target Specification + +```{eval-rst} + +.. module:: pystencils.codegen.target + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/recursive_class.rst + + Target + +``` + +## Code Generation Drivers + +```{eval-rst} +.. module:: pystencils.codegen.driver + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + DefaultKernelCreationDriver + +.. autosummary:: + :toctree: generated + :nosignatures: + + get_driver +``` + +## Output Code Objects + +```{eval-rst} +.. currentmodule:: pystencils.codegen + +.. 
autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + Kernel + GpuKernel + Parameter + GpuThreadsRange +``` diff --git a/docs/source/api/codegen.rst b/docs/source/api/codegen.rst deleted file mode 100644 index 1fb83fe5f..000000000 --- a/docs/source/api/codegen.rst +++ /dev/null @@ -1,94 +0,0 @@ -Code Generation -=============== - -.. module:: pystencils.codegen - -Invocation ----------- - -.. autosummary:: - :toctree: generated - :nosignatures: - - create_kernel - -Configuration -------------- - -.. module:: pystencils.codegen.config - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/recursive_class.rst - - CreateKernelConfig - CpuOptions - OpenMpOptions - VectorizationOptions - GpuOptions - SyclOptions - -.. autosummary:: - :toctree: generated - :nosignatures: - - AUTO - -.. dropdown:: Configuration System Implementation Details - - .. autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/entire_class.rst - - Option - BasicOption - Category - ConfigBase - - -Target Specification --------------------- - -.. module:: pystencils.codegen.target - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/recursive_class.rst - - Target - -Code Generation Drivers ------------------------ - -.. module:: pystencils.codegen.driver - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/entire_class.rst - - DefaultKernelCreationDriver - -.. autosummary:: - :toctree: generated - :nosignatures: - - get_driver - -Output Code Objects -------------------- - -.. currentmodule:: pystencils.codegen - -.. 
autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/entire_class.rst - - Kernel - GpuKernel - Parameter - GpuThreadsRange diff --git a/docs/source/migration.md b/docs/source/migration.md index c3cb17d0f..bb4a2cffb 100644 --- a/docs/source/migration.md +++ b/docs/source/migration.md @@ -48,12 +48,14 @@ cfg.ghost_layers = 2 for all index calculations and loop counters. - *CPU Optimization Options:* Should now be set via the {any}`cpu <CpuOptions>` option category and its subcategories. -.. dropdown:: Deprecated options of `CreateKernelConfig` +:::{dropdown} Deprecated options of `CreateKernelConfig` - - ``data_type``: Use ``default_dtype`` instead - - ``cpu_openmp``: Set OpenMP-Options in the `cpu.openmp <OpenMpOptions>` category instead. - - ``cpu_vectorize_info``: Set vectorization options in the `cpu.vectorize <VectorizationOptions>` category instead - - ``gpu_indexing_params``: Set GPU indexing options in the `gpu <GpuOptions>` category instead +- ``data_type``: Use ``default_dtype`` instead +- ``cpu_openmp``: Set OpenMP-Options in the `cpu.openmp <OpenMpOptions>` category instead. 
+- ``cpu_vectorize_info``: Set vectorization options in the `cpu.vectorize <VectorizationOptions>` category instead +- ``gpu_indexing_params``: Set GPU indexing options in the `gpu <GpuOptions>` category instead + +::: ### Type Checking diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py index f7425c06b..313377fd8 100644 --- a/src/pystencils/backend/kernelcreation/iteration_space.py +++ b/src/pystencils/backend/kernelcreation/iteration_space.py @@ -453,11 +453,7 @@ def create_full_iteration_space( assert not ctx.fields.index_fields - if not ( - (ghost_layers is not None) - or (iteration_slice is not None) - or infer_ghost_layers - ): + if (ghost_layers is None) and (iteration_slice is None) and not infer_ghost_layers: raise ValueError( "One argument of `ghost_layers`, `iteration_slice`, and `infer_ghost_layers` must be set." ) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index cb457f673..cbb3f4f32 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -82,17 +82,17 @@ class Option(Generic[Option_T, Arg_T]): def is_set(self, obj) -> bool: return getattr(obj, self._lookup, None) is not None - def __set_name__(self, owner, name: str): + def __set_name__(self, owner: ConfigBase, name: str): self._name = name self._lookup = f"_{name}" - def __get__(self, obj, objtype=None) -> Option_T | None: + def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Option_T | None: if obj is None: return None return getattr(obj, self._lookup, None) - def __set__(self, obj, arg: Arg_T | None): + def __set__(self, obj: ConfigBase, arg: Arg_T | None): if arg is not None and self._validator is not None: value = self._validator(obj, arg) else: @@ -190,17 +190,17 @@ class Category(Generic[Category_T]): def __init__(self, default: Category_T): self._default = default - def __set_name__(self, owner, name: str): + def 
__set_name__(self, owner: ConfigBase, name: str): self._name = name self._lookup = f"_{name}" - def __get__(self, obj, objtype=None) -> Category_T: + def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Category_T: if obj is None: return self._default return cast(Category_T, getattr(obj, self._lookup, None)) - def __set__(self, obj, cat: Category_T): + def __set__(self, obj: ConfigBase, cat: Category_T): setattr(obj, self._lookup, cat.copy()) @@ -208,12 +208,7 @@ class _AUTO_TYPE: ... # noqa: E701 AUTO = _AUTO_TYPE() -"""Special value that can be passed to some options for invoking automatic behaviour. - -Currently, these options permit `AUTO`: - -- `ghost_layers <CreateKernelConfig.ghost_layers>` -""" +"""Special value that can be passed to some options for invoking automatic behaviour.""" @dataclass diff --git a/tests/runtime/test_data/datahandling_save_test.npz b/tests/runtime/test_data/datahandling_save_test.npz index d363a8a0aba1bb78a06314a19b887eb4c4975334..486c7ee74d4421d563c3b1c2e3739d8db6308b07 100644 GIT binary patch literal 420 zcmWIWW@Zs#U|`??Vnv4TVm_%zAd7*Ofq|VtgrT@7Sud}kl953GECiAPO9ScIZ^U0o z3!FR=a4cZ$yh%}WVwU7BU6409ZQ;7b3+7FW4+)wwLwtVxlu2Ad{F++6R|5m|&w1#0 zgl-SIU~n-Ih&Eig61XB%LAb!M;UN3(z^W;7OWiZstS&PI;B`?-amfI<h>=Nz8CNKQ m0tEsZ7(p~N%mTcL)iepH3F@r?Z&o&t8B9Q!1*BELCISH25ot;Q literal 410 zcmWIWW@Zs#U|`??Vnv4TVm_%5Ad7*Ofq|VtgrT@7Sud}kl953GECiAPO9ScIZ^U0o z3!FR=a4cZ$yh%}WVwU7BU6409ZQ;7b3+7FW4+)wwLwtVxlu2Ad{F++6tX$&hDq>5R zc1o#PaXF-{T)8-4wS(G&B!*`GZ;QWZ*n0I}`m&5M0Iy?Gic9G07)B-$W?W$d3JM5l iU<A?7kP7f7R#Puf6Vyim-mGjOGnjxd3rI_WO#}cXWnnh} -- GitLab