diff --git a/pystencils/__init__.py b/pystencils/__init__.py index a7e21703b48c7398da8be6f609f0f3123f7822d1..cdfa7f3cc9b19cd044e17136c406f4cecffa7755 100644 --- a/pystencils/__init__.py +++ b/pystencils/__init__.py @@ -1,17 +1,19 @@ """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions""" from . import sympy_gmpy_bug_workaround # NOQA -from . import fd from . import stencil as stencil -from .assignment import Assignment, assignment_from_stencil -from .data_types import TypedSymbol -from .datahandling import create_data_handling -from .display_utils import show_code, to_dot from .field import Field, FieldType, fields -from .kernel_decorator import kernel -from .kernelcreation import create_indexed_kernel, create_kernel, create_staggered_kernel -from .simp import AssignmentCollection +from .data_types import TypedSymbol, matrix_symbols, typed_symbols from .slicing import make_slice +from .kernelcreation import create_kernel, create_indexed_kernel, create_staggered_kernel +from .display_utils import show_code, to_dot +from .simp import AssignmentCollection +from .assignment import Assignment, assignment_from_stencil from .sympyextensions import SymbolCreator +from .datahandling import create_data_handling +from .kernel_decorator import kernel +from . import fd +from .spatial_coordinates import (x_, x_staggered, x_staggered_vector, x_vector, + y_, y_staggered, z_, z_staggered) __all__ = ['Field', 'FieldType', 'fields', 'TypedSymbol', @@ -25,4 +27,8 @@ __all__ = ['Field', 'FieldType', 'fields', 'create_data_handling', 'kernel', 'fd', + 'x_', 'y_', 'z_', + 'x_staggered', 'y_staggered', 'z_staggered', + 'x_vector', 'x_staggered_vector', + 'typed_symbols', 'matrix_symbols', 'stencil'] diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index 7e43c1a8a9a3fe7efd092840226e46f529648df3..a8c34cff6ac3fb873f1e50e0ab508056e0aab9b1 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -33,7 +33,7 @@ class Node: raise NotImplementedError() def subs(self, subs_dict) -> None: - """Inplace! substitute, similar to sympy's but modifies the AST inplace.""" + """Inplace! 
Substitute, similar to sympy's but modifies the AST inplace.""" for a in self.args: a.subs(subs_dict) @@ -102,7 +102,8 @@ class Conditional(Node): result = self.true_block.undefined_symbols if self.false_block: result.update(self.false_block.undefined_symbols) - result.update(self.condition_expr.atoms(sp.Symbol)) + if hasattr(self.condition_expr, 'atoms'): + result.update(self.condition_expr.atoms(sp.Symbol)) return result def __str__(self): @@ -227,7 +228,8 @@ class KernelFunction(Node): return () argument_symbols = self._body.undefined_symbols - self.global_variables - parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols] + parameters = [self.Parameter(symbol, get_fields(symbol)) + for symbol in argument_symbols] if hasattr(self, 'indexing'): parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()] parameters.sort(key=lambda p: p.symbol.name) diff --git a/pystencils/backends/cuda_backend.py b/pystencils/backends/cuda_backend.py index 3c1a3cd8bf9629c0b003cc8b6c38693c4426e282..915d5601b5fe7bc05b9833c21da62915b22552e5 100644 --- a/pystencils/backends/cuda_backend.py +++ b/pystencils/backends/cuda_backend.py @@ -1,7 +1,12 @@ from os.path import dirname, join +import numpy as np +import sympy +from sympy.core.compatibility import string_types + from pystencils.astnodes import Node from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c +from pystencils.data_types import cast_func, create_type, get_type_of_expression from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f: @@ -43,11 +48,19 @@ class CudaBackend(CBackend): return code def _print_TextureDeclaration(self, node): - code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % ( - str(node.texture.field.dtype), - node.texture.field.spatial_dimensions, - node.texture - ) + + if node.texture.field.dtype.numpy_dtype.itemsize > 4: + code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % ( + str(node.texture.field.dtype), + node.texture.field.spatial_dimensions, + node.texture + ) + else: + code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % ( + str(node.texture.field.dtype), + node.texture.field.spatial_dimensions, + node.texture + ) return code def _print_SkipIteration(self, _): @@ -62,17 +75,23 @@ class CudaSympyPrinter(CustomSympyPrinter): self.known_functions.update(CUDA_KNOWN_FUNCTIONS) def _print_TextureAccess(self, node): + dtype = node.texture.field.dtype.numpy_dtype if node.texture.cubic_bspline_interpolation: template = "cubicTex%iDSimple<%s>(%s, %s)" else: - template = "tex%iD<%s>(%s, %s)" + if dtype.itemsize > 4: + # Use PyCuda hack! 
+ # https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp + template = "fp_tex%iD(%s, %s)" + else: + template = "tex%iD(%s, %s)" code = template % ( node.texture.field.spatial_dimensions, - str(node.texture.field.dtype), str(node.texture), - ', '.join(self._print(o) for o in node.offsets) + # + 0.5 comes from Nvidia's staggered indexing + ', '.join(self._print(o + 0.5) for o in reversed(node.offsets)) ) return code @@ -84,3 +103,78 @@ class CudaSympyPrinter(CustomSympyPrinter): elif isinstance(expr, fast_inv_sqrt): return "__frsqrt_rn(%s)" % tuple(self._print(a) for a in expr.args) return super()._print_Function(expr) + + def _print_Pow(self, expr): + if np.issubdtype(get_type_of_expression(expr.args[0], + default_float_type=self._float_type).numpy_dtype, np.integer): + expr._args[0] = cast_func(expr.args[0], create_type(self._float_type)) + + if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8: + return "(" + self._print(sympy.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")" + elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0: + return "1 / ({})".format(self._print(sympy.Mul(*[expr.base] * (-expr.exp), evaluate=False))) + else: + + try: + suffix = self._get_func_suffix(get_type_of_expression( + expr.args[0], default_float_type=self._float_type).sympy_dtype) + except Exception: + suffix = "" + if expr.exp == -1: + return '1.0%s/%s' % (suffix.upper(), self.parenthesize(expr.base, sympy.PREC)) + elif expr.exp == 0.5: + return '%ssqrt%s(%s)' % (self._ns, suffix, self._print(expr.base)) + elif expr.exp == sympy.S.One / 3 and self.standard != 'C89': + return '%scbrt%s(%s)' % (self._ns, suffix, self._print(expr.base)) + else: + return '%spow%s(%s, %s)' % (self._ns, suffix, self._print(expr.base), + self._print(expr.exp)) + + def _print_math_func(self, expr, nest=False, known=None): + if np.issubdtype(get_type_of_expression(expr.args[0], + default_float_type=self._float_type).numpy_dtype, np.integer): + expr._args = (cast_func(expr.args[0], self._float_type), *expr.args[1:]) + + if len(expr.args) == 2 and \ + (get_type_of_expression(expr.args[0], self._float_type) != # NOQA + get_type_of_expression(expr.args[1], default_float_type=self._float_type)): + expr._args = (expr.args[0], cast_func(expr.args[1], get_type_of_expression( + expr.args[0], default_float_type=self._float_type))) + + if known is None: + known = self.known_functions[expr.__class__.__name__] + if not isinstance(known, string_types): + for cb, name in known: + if cb(*expr.args): + known = name + break + else: + raise ValueError("No matching printer") + suffix = self._get_func_suffix( + get_type_of_expression(expr.args[0], default_float_type=self._float_type + ).sympy_dtype) if self._ns + known in self._prec_funcs else '' + + if nest: + args = self._print(expr.args[0]) + if len(expr.args) > 1: + paren_pile = '' + for curr_arg in expr.args[1:-1]: + paren_pile += ')' + args += ', {ns}{name}{suffix}({next}'.format( + ns=self._ns, + name=known, + suffix=suffix, + next=self._print(curr_arg) + ) + args += ', %s%s' % ( + self._print(expr.func(expr.args[-1])), + paren_pile + ) + else: + args = ', '.join(map(lambda arg: self._print(arg), expr.args)) + return '{ns}{name}{suffix}({args})'.format( + ns=self._ns, + name=known, + suffix=suffix, + args=args + ) diff --git a/pystencils/backends/cuda_known_functions.txt b/pystencils/backends/cuda_known_functions.txt index 42cf554ada57d9055bddc659d53c4eaff664275a..224f4a49d65322446a6b17c696351f9371435887 100644 --- 
a/pystencils/backends/cuda_known_functions.txt +++ b/pystencils/backends/cuda_known_functions.txt @@ -45,6 +45,7 @@ tex1D tex2D tex3D +sqrtf rsqrtf cbrtf rcbrtf diff --git a/pystencils/cpu/kernelcreation.py b/pystencils/cpu/kernelcreation.py index fc6765e589e341b40ebbc778c7c26839531ae826..7859c537fe3fbe50a2e94bb2c2c67622f1c1f852 100644 --- a/pystencils/cpu/kernelcreation.py +++ b/pystencils/cpu/kernelcreation.py @@ -10,8 +10,8 @@ from pystencils.data_types import BasicType, StructType, TypedSymbol, create_typ from pystencils.field import Field, FieldType from pystencils.transformations import ( add_types, filtered_tree_iteration, get_base_buffer_index, get_optimal_loop_ordering, - make_loop_over_domain, move_constants_before_loop, parse_base_pointer_info, - resolve_buffer_accesses, resolve_field_accesses, split_inner_loop) + implement_interpolations, make_loop_over_domain, move_constants_before_loop, + parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, split_inner_loop) AssignmentOrAstNodeList = List[Union[Assignment, ast.Node]] @@ -67,6 +67,7 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke ghost_layers=ghost_layers, loop_order=loop_order) ast_node = KernelFunction(loop_node, 'cpu', 'c', compile_function=make_python_function, ghost_layers=ghost_layer_info, function_name=function_name) + implement_interpolations(body) if split_groups: typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups] @@ -139,6 +140,8 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu loop_body = Block([]) loop_node = LoopOverCoordinate(loop_body, coordinate_to_loop_over=0, start=0, stop=index_fields[0].shape[0]) + implement_interpolations(loop_node) + for assignment in assignments: loop_body.append(assignment) diff --git a/pystencils/data_types.py b/pystencils/data_types.py index 6bae2da8de109085421dab6a5896f96e7bf9c3e2..ef2ef558db830752264d08d7113cb6c3ee3fe99c 100644 --- a/pystencils/data_types.py +++ b/pystencils/data_types.py @@ -1,10 +1,14 @@ import ctypes +import uuid +from typing import Tuple import numpy as np import sympy as sp +import sympy.codegen.ast from sympy.core.cache import cacheit from sympy.logic.boolalg import Boolean +import pystencils from pystencils.cache import memorycache from pystencils.utils import all_equal @@ -15,6 +19,30 @@ except ImportError as e: _ir_importerror = e +def typed_symbols(names, dtype, *args): + symbols = sp.symbols(names, *args) + if isinstance(symbols, Tuple): + return tuple(TypedSymbol(str(s), dtype) for s in symbols) + else: + return TypedSymbol(str(symbols), dtype) + + +def matrix_symbols(names, dtype, rows, cols): + if isinstance(names, str): + names = names.replace(' ', '').split(',') + + matrices = [] + for n in names: + symbols = typed_symbols("%s:%i" % (n, rows * cols), dtype) + matrices.append(sp.Matrix(rows, cols, lambda i, j: symbols[i * cols + j])) + + return tuple(matrices) + + +def get_dummy_symbol(dtype=None): + return pystencils.typed_symbols('dummy%s' % uuid.uuid4().hex, dtype or create_type('bool')) + + # noinspection PyPep8Naming class address_of(sp.Function): is_Atom = True @@ -329,12 +357,17 @@ def peel_off_type(dtype, type_to_peel_off): return dtype -def collate_types(types): +def collate_types(types, forbid_collation_to_float=False): """ Takes a sequence of types and returns their "common type" e.g. (float, double, float) -> double Uses the collation rules from numpy. 
""" + if forbid_collation_to_float: + types = [t for t in types if not (hasattr(t, 'is_float') and t.is_float())] + if not types: + return [create_type('int32')] + # Pointer arithmetic case i.e. pointer + integer is allowed if any(type(t) is PointerType for t in types): pointer_type = None @@ -370,17 +403,19 @@ def collate_types(types): @memorycache(maxsize=2048) -def get_type_of_expression(expr): +def get_type_of_expression(expr, default_float_type="double", default_int_type='int'): from pystencils.astnodes import ResolvedFieldAccess from pystencils.cpu.vectorization import vec_all, vec_any expr = sp.sympify(expr) if isinstance(expr, sp.Integer): - return create_type("int") + return create_type(default_int_type) elif isinstance(expr, sp.Rational) or isinstance(expr, sp.Float): - return create_type("double") + return create_type(default_float_type) elif isinstance(expr, ResolvedFieldAccess): return expr.field.dtype + elif isinstance(expr, pystencils.field.Field.AbstractAccess): + return expr.field.dtype elif isinstance(expr, TypedSymbol): return expr.dtype elif isinstance(expr, sp.Symbol): @@ -401,16 +436,23 @@ def get_type_of_expression(expr): elif isinstance(expr, sp.boolalg.Boolean) or isinstance(expr, sp.boolalg.BooleanFunction): # if any arg is of vector type return a vector boolean, else return a normal scalar boolean result = create_type("bool") - vec_args = [get_type_of_expression(a) for a in expr.args if isinstance(get_type_of_expression(a), VectorType)] + vec_args = [get_type_of_expression(a) + for a in expr.args if isinstance(get_type_of_expression(a), VectorType)] if vec_args: result = VectorType(result, width=vec_args[0].width) return result elif isinstance(expr, sp.Pow): return get_type_of_expression(expr.args[0]) elif isinstance(expr, sp.Expr): - types = tuple(get_type_of_expression(a) for a in expr.args) - return collate_types(types) - + expr: sp.Expr + if expr.args: + types = tuple(get_type_of_expression(a) for a in expr.args) + return collate_types(types) + else: + if expr.is_integer: + return create_type(default_int_type) + else: + return create_type(default_float_type) raise NotImplementedError("Could not determine type for", expr, type(expr)) @@ -463,6 +505,10 @@ class BasicType(Type): def numpy_dtype(self): return self._dtype + @property + def sympy_dtype(self): + return getattr(sympy.codegen.ast, str(self.numpy_dtype)) + @property def item_size(self): return 1 diff --git a/pystencils/field.py b/pystencils/field.py index b6437a971398c70c0bcf392129269b87c912f0b8..be0f2b3801fd280156abce7fda7f3ecb20bf3885 100644 --- a/pystencils/field.py +++ b/pystencils/field.py @@ -1,4 +1,6 @@ +import functools import hashlib +import operator import pickle import re from enum import Enum @@ -9,6 +11,7 @@ import numpy as np import sympy as sp from sympy.core.cache import cacheit +import pystencils from pystencils.alignedarray import aligned_empty from pystencils.data_types import StructType, TypedSymbol, create_type from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol @@ -38,7 +41,6 @@ def fields(description=None, index_dimensions=0, layout=None, **kwargs): >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype >>> assert v.index_shape == (2,) - Format string can be left out, field names are taken from keyword arguments. 
>>> fields(f1=arr_s, f2=arr_s) [f1, f2] @@ -292,6 +294,10 @@ class Field(AbstractField): self.shape = shape self.strides = strides self.latex_name = None # type: Optional[str] + self.coordinate_origin = sp.Matrix(tuple( + 0 for _ in range(self.spatial_dimensions) + )) # type: tuple[float,sp.Symbol] + self.coordinate_transform = sp.eye(self.spatial_dimensions) def new_field_with_different_name(self, new_name): if self.has_fixed_shape: @@ -312,6 +318,9 @@ class Field(AbstractField): def ndim(self) -> int: return len(self.shape) + def values_per_cell(self) -> int: + return functools.reduce(operator.mul, self.index_shape, 1) + @property def layout(self): return self._layout @@ -409,6 +418,34 @@ class Field(AbstractField): return False return self.hashable_contents() == other.hashable_contents() + @property + def physical_coordinates(self): + return self.coordinate_transform @ (self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions)) + + @property + def physical_coordinates_staggered(self): + return self.coordinate_transform @ \ + (self.coordinate_origin + pystencils.x_staggered_vector(self.spatial_dimensions)) + + def index_to_physical(self, index_coordinates, staggered=False): + if staggered: + index_coordinates = sp.Matrix([i + 0.5 for i in index_coordinates]) + return self.coordinate_transform @ (self.coordinate_origin + index_coordinates) + + def physical_to_index(self, physical_coordinates, staggered=False): + rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin + if staggered: + rtn = sp.Matrix([i - 0.5 for i in rtn]) + + return rtn + + def index_to_staggered_physical_coordinates(self, symbol_vector): + symbol_vector += sp.Matrix([0.5] * self.spatial_dimensions) + return self.create_physical_coordinates(symbol_vector) + + def set_coordinate_origin_to_field_center(self): + self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape]) + # noinspection PyAttributeOutsideInit,PyUnresolvedReferences class Access(TypedSymbol, AbstractField.AbstractAccess): """Class representing a relative access into a `Field`. 
@@ -429,11 +466,12 @@ class Field(AbstractField): >>> central_y_component.at_index(0) # change component v_C^0 """ + def __new__(cls, name, *args, **kwargs): obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs) return obj - def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False): + def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False, dtype=None): field_name = field.name offsets_and_index = (*offsets, *idx) if idx is not None else offsets constant_offsets = not any([isinstance(o, sp.Basic) and not o.is_Integer for o in offsets_and_index]) @@ -484,7 +522,7 @@ class Field(AbstractField): return obj def __getnewargs__(self): - return self.field, self.offsets, self.index, self.is_absolute_access + return self.field, self.offsets, self.index, self.is_absolute_access, self.dtype # noinspection SpellCheckingInspection __xnew__ = staticmethod(__new_stage2__) @@ -503,7 +541,7 @@ class Field(AbstractField): if len(idx) != self.field.index_dimensions: raise ValueError("Wrong number of indices: " "Got %d, expected %d" % (len(idx), self.field.index_dimensions)) - return Field.Access(self.field, self._offsets, idx) + return Field.Access(self.field, self._offsets, idx, dtype=self.dtype) def __getitem__(self, *idx): return self.__call__(*idx) @@ -562,7 +600,7 @@ class Field(AbstractField): """ offset_list = list(self.offsets) offset_list[coord_id] += offset - return Field.Access(self.field, tuple(offset_list), self.index) + return Field.Access(self.field, tuple(offset_list), self.index, dtype=self.dtype) def get_shifted(self, *shift) -> 'Field.Access': """Returns a new Access with changed spatial coordinates @@ -572,7 +610,10 @@ class Field(AbstractField): >>> f[0,0].get_shifted(1, 1) f_NE """ - return Field.Access(self.field, tuple(a + b for a, b in zip(shift, self.offsets)), self.index) + return Field.Access(self.field, + tuple(a + b for a, b in zip(shift, self.offsets)), + self.index, + dtype=self.dtype) def at_index(self, *idx_tuple) -> 'Field.Access': """Returns new Access with changed index. 
@@ -582,7 +623,7 @@ class Field(AbstractField): >>> f(0).at_index(8) f_C^8 """ - return Field.Access(self.field, self.offsets, idx_tuple) + return Field.Access(self.field, self.offsets, idx_tuple, dtype=self.dtype) @property def is_absolute_access(self) -> bool: diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py index 92738e1e7df9c35dca87ff5f3de84ceeb5bcad8a..ee78b4663c4f40ebf3de0f96ebd405046a75de3c 100644 --- a/pystencils/gpucuda/cudajit.py +++ b/pystencils/gpucuda/cudajit.py @@ -1,9 +1,13 @@ +from os.path import dirname, isdir, join + import numpy as np from pystencils.backends.cbackend import generate_c, get_headers from pystencils.data_types import StructType from pystencils.field import FieldType +from pystencils.gpucuda.texture_utils import ndarray_to_tex from pystencils.include import get_pystencils_include_path +from pystencils.interpolation_astnodes import TextureAccess from pystencils.kernelparameters import FieldPointerSymbol USE_FAST_MATH = True @@ -29,17 +33,30 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen if argument_dict is None: argument_dict = {} - header_list = ['<stdint.h>'] + list(get_headers(kernel_function_node)) + header_list = ['<cstdint>', '<pycuda-helper-modified.hpp>'] + list(get_headers(kernel_function_node)) includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) code = includes + "\n" code += "#define FUNC_PREFIX __global__\n" code += "#define RESTRICT __restrict__\n\n" code += str(generate_c(kernel_function_node, dialect='cuda', custom_backend=custom_backend)) - options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] + textures = set(d.texture for d in kernel_function_node.atoms(TextureAccess)) + + nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] if USE_FAST_MATH: - options.append("-use_fast_math") - mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()]) + nvcc_options.append("-use_fast_math") + + if any(t.cubic_bspline_interpolation for t in textures): + assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \ + "Submodule CubicInterpolationCUDA does not exist" + nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")] + nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")] + + needed_dims = set(t.field.spatial_dimensions for t in textures if t.cubic_bspline_interpolation) + for i in needed_dims: + code = 'extern "C++" {\n#include "cubicTex%iD.cu"\n}\n' % i + code + + mod = SourceModule(code, options=nvcc_options, include_dirs=[get_pystencils_include_path()]) func = mod.get_function(kernel_function_node.function_name) parameters = kernel_function_node.get_parameters() @@ -63,6 +80,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid']) + # TODO: use texture objects: + # https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ + for tex in textures: + tex_ref = mod.get_texref(str(tex)) + ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode, + tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer) args = _build_numpy_argument_list(parameters, full_arguments) cache[key] = (args, block_and_thread_numbers) cache_values.append(kwargs) # keep 
objects alive such that ids remain unique diff --git a/pystencils/gpucuda/kernelcreation.py b/pystencils/gpucuda/kernelcreation.py index ff82107000b00595dbcca6699ed42b172c324353..ed916d54b8d4434e0e38beb68dd8ecb933aff567 100644 --- a/pystencils/gpucuda/kernelcreation.py +++ b/pystencils/gpucuda/kernelcreation.py @@ -4,12 +4,18 @@ from pystencils.field import Field, FieldType from pystencils.gpucuda.cudajit import make_python_function from pystencils.gpucuda.indexing import BlockIndexing from pystencils.transformations import ( - add_types, get_base_buffer_index, get_common_shape, parse_base_pointer_info, - resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols) - - -def create_cuda_kernel(assignments, function_name="kernel", type_info=None, indexing_creator=BlockIndexing, - iteration_slice=None, ghost_layers=None, skip_independence_check=False): + add_types, get_base_buffer_index, get_common_shape, implement_interpolations, + parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols) + + +def create_cuda_kernel(assignments, + function_name="kernel", + type_info=None, + indexing_creator=BlockIndexing, + iteration_slice=None, + ghost_layers=None, + skip_independence_check=False, + use_textures_for_interpolation=True): fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check) all_fields = fields_read.union(fields_written) read_only_fields = set([f.name for f in fields_read - fields_written]) @@ -57,6 +63,8 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde ast = KernelFunction(block, 'gpu', 'gpucuda', make_python_function, ghost_layers, function_name) ast.global_variables.update(indexing.index_variables) + implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) + base_pointer_spec = [['spatialInner0']] base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], f.spatial_dimensions, f.index_dimensions) @@ -86,8 +94,13 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde return ast -def created_indexed_cuda_kernel(assignments, index_fields, function_name="kernel", type_info=None, - coordinate_names=('x', 'y', 'z'), indexing_creator=BlockIndexing): +def created_indexed_cuda_kernel(assignments, + index_fields, + function_name="kernel", + type_info=None, + coordinate_names=('x', 'y', 'z'), + indexing_creator=BlockIndexing, + use_textures_for_interpolation=True): fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False) all_fields = fields_read.union(fields_written) read_only_fields = set([f.name for f in fields_read - fields_written]) @@ -125,6 +138,8 @@ def created_indexed_cuda_kernel(assignments, index_fields, function_name="kernel ast = KernelFunction(function_body, 'gpu', 'gpucuda', make_python_function, None, function_name) ast.global_variables.update(indexing.index_variables) + implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) + coord_mapping = indexing.coordinates base_pointer_spec = [['spatialInner0']] base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], diff --git a/pystencils/gpucuda/texture_utils.py b/pystencils/gpucuda/texture_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d29d862929530b27f840da445467a2bae44b1bfb --- /dev/null +++ b/pystencils/gpucuda/texture_utils.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +# 
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. + +""" + +""" + +from os.path import dirname, isdir, join + +import numpy as np + +try: + import pycuda.driver as cuda + from pycuda import gpuarray +except Exception: + pass + + +def pow_two_divider(n): + if n == 0: + return 0 + divider = 1 + while (n & divider) == 0: + divider <<= 1 + return divider + + +def ndarray_to_tex(tex_ref, + ndarray, + address_mode=None, + filter_mode=None, + use_normalized_coordinates=False, + read_as_integer=False): + + if address_mode is None: + address_mode = cuda.address_mode.BORDER + if filter_mode is None: + filter_mode = cuda.filter_mode.LINEAR + + if isinstance(ndarray, np.ndarray): + cu_array = cuda.np_to_array(ndarray, 'C') + elif isinstance(ndarray, gpuarray.GPUArray): + cu_array = cuda.gpuarray_to_array(ndarray, 'C') + else: + raise TypeError( + 'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray') + + cuda.TextureReference.set_array(tex_ref, cu_array) + + tex_ref.set_address_mode(0, address_mode) + if ndarray.ndim >= 2: + tex_ref.set_address_mode(1, address_mode) + if ndarray.ndim >= 3: + tex_ref.set_address_mode(2, address_mode) + tex_ref.set_filter_mode(filter_mode) + + if not use_normalized_coordinates: + tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES) + + if not read_as_integer: + tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER) + + +def prefilter_for_cubic_bspline(gpuarray): + import pycuda.autoinit # NOQA + from pycuda.compiler import SourceModule + + ndim = gpuarray.ndim + assert ndim == 2 or ndim == 3, "Only 2d or 3d supported" + assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \ + "Submodule CubicInterpolationCUDA does not exist" + nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] + nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")] + nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")] + + file_name = join(dirname(__file__), "CubicInterpolationCUDA", "code", "cubicPrefilter%iD.cu" % ndim) + with open(file_name) as file: + code = file.read() + + mod = SourceModule(code, options=nvcc_options) + + if ndim == 2: + height, width = gpuarray.shape + block = min(pow_two_divider(height), 64) + grid = height // block + func = mod.get_function('SamplesToCoefficients2DXf') + func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r) + for r in reversed(gpuarray.shape)), + block=(block, 1, 1), + grid=(grid, 1, 1)) + + block = min(pow_two_divider(width), 64) + grid = width // block + func = mod.get_function('SamplesToCoefficients2DYf') + func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r) + for r in reversed(gpuarray.shape)), + block=(block, 1, 1), + grid=(grid, 1, 1)) + elif ndim == 3: + depth, height, width = gpuarray.shape + dimX = min(min(pow_two_divider(width), pow_two_divider(height)), 64) + dimY = min(min(pow_two_divider(depth), pow_two_divider(height)), 512 / dimX) + block = (dimX, dimY, 1) + + dimGridX = (height // block[0], depth // block[1], 1) + dimGridY = (width // block[0], depth // block[1], 1) + dimGridZ = (width // block[0], height // block[1], 1) + + func = mod.get_function("SamplesToCoefficients3DXf") + func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r) + for r in reversed(gpuarray.shape)), + block=block, + grid=dimGridX) + func = mod.get_function("SamplesToCoefficients3DYf") + func(gpuarray, np.uint32(gpuarray.strides[-2]), 
*(np.uint32(r) + for r in reversed(gpuarray.shape)), + block=block, + grid=dimGridY) + func = mod.get_function("SamplesToCoefficients3DZf") + func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r) + for r in reversed(gpuarray.shape)), + block=block, + grid=dimGridZ) diff --git a/pystencils/include/pycuda-helper-modified.hpp b/pystencils/include/pycuda-helper-modified.hpp new file mode 100644 index 0000000000000000000000000000000000000000..53b3797016b73779c1a6beba09f91d8ac9fe8dfc --- /dev/null +++ b/pystencils/include/pycuda-helper-modified.hpp @@ -0,0 +1,249 @@ +#include <pycuda-complex.hpp> +#include <surface_functions.h> +#ifndef _AFJKDASLFSADHF_HEADER_SEEN_PYCUDA_HELPERS_HPP +#define _AFJKDASLFSADHF_HEADER_SEEN_PYCUDA_HELPERS_HPP + +extern "C++" { + // "double-precision" textures ------------------------------------------------ + /* Thanks to Nathan Bell <nbell@nvidia.com> for help in figuring this out. */ + + typedef float fp_tex_float; + typedef int2 fp_tex_double; + typedef uint2 fp_tex_cfloat; + typedef int4 fp_tex_cdouble; + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<float> fp_tex1Dfetch(texture<fp_tex_cfloat, 1, read_mode> tex, float i) + { + fp_tex_cfloat v = tex1Dfetch(tex, i); + pycuda::complex<float> out; + return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y)); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<double> fp_tex1Dfetch(texture<fp_tex_cdouble, 1, read_mode> tex, float i) + { + fp_tex_cdouble v = tex1Dfetch(tex, i); + return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); + } + + template <enum cudaTextureReadMode read_mode> + __device__ double fp_tex1Dfetch(texture<fp_tex_double, 1, read_mode> tex, float i) + { + fp_tex_double v = tex1Dfetch(tex, i); + return __hiloint2double(v.y, v.x); + } + +// 2D functionality + + template <enum cudaTextureReadMode read_mode> + __device__ double fp_tex2D(texture<fp_tex_double, 2, read_mode> tex, float i, float j) + { + fp_tex_double v = tex2D(tex, i, j); + return __hiloint2double(v.y, v.x); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<float> fp_tex2D(texture<fp_tex_cfloat, 2, read_mode> tex, float i, float j) + { + fp_tex_cfloat v = tex2D(tex, i, j); + return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y)); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<double> fp_tex2D(texture<fp_tex_cdouble, 2, read_mode> tex, float i, float j) + { + fp_tex_cdouble v = tex2D(tex, i, j); + return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); + } + // 2D Layered extension + + template <enum cudaTextureReadMode read_mode> + __device__ double fp_tex2DLayered(texture<fp_tex_double, cudaTextureType2DLayered, read_mode> tex, float i, float j, int layer) + { + fp_tex_double v = tex2DLayered(tex, i, j, layer); + return __hiloint2double(v.y, v.x); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<float> fp_tex2DLayered(texture<fp_tex_cfloat, cudaTextureType2DLayered, read_mode> tex, float i, float j, int layer) + { + fp_tex_cfloat v = tex2DLayered(tex, i, j, layer); + return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y)); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<double> fp_tex2DLayered(texture<fp_tex_cdouble, cudaTextureType2DLayered, read_mode> tex, float i, float j, int layer) + { + fp_tex_cdouble v = 
tex2DLayered(tex, i, j, layer); + return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); + } + + // 3D functionality + + template <enum cudaTextureReadMode read_mode> + __device__ double fp_tex3D(texture<fp_tex_double, 3, read_mode> tex, float i, float j, float k) + { + fp_tex_double v = tex3D(tex, i, j, k); + return __hiloint2double(v.y, v.x); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<float> fp_tex3D(texture<fp_tex_cfloat, 3, read_mode> tex, float i, float j, float k) + { + fp_tex_cfloat v = tex3D(tex, i, j, k); + return pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y)); + } + + template <enum cudaTextureReadMode read_mode> + __device__ pycuda::complex<double> fp_tex3D(texture<fp_tex_cdouble, 3, read_mode> tex, float i, float j, float k) + { + fp_tex_cdouble v = tex3D(tex, i, j, k); + return pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); + } + + // FP_Surfaces with complex supprt + + __device__ void fp_surf2DLayeredwrite(double var,surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_double auxvar; + auxvar.x = __double2loint(var); + auxvar.y = __double2hiint(var); + surf2DLayeredwrite(auxvar, surf, i*sizeof(fp_tex_double), j, layer, mode); + } + + __device__ void fp_surf2DLayeredwrite(pycuda::complex<float> var,surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cfloat auxvar; + auxvar.x = __float_as_int(var._M_re); + auxvar.y = __float_as_int(var._M_im); + surf2DLayeredwrite(auxvar, surf, i*sizeof(fp_tex_cfloat), j, layer,mode); + } + + __device__ void fp_surf2DLayeredwrite(pycuda::complex<double> var,surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cdouble auxvar; + auxvar.x = __double2loint(var._M_re); + auxvar.y = __double2hiint(var._M_re); + + auxvar.z = __double2loint(var._M_im); + auxvar.w = __double2hiint(var._M_im); + surf2DLayeredwrite(auxvar, surf, i*sizeof(fp_tex_cdouble), j, layer,mode); + } + + __device__ void fp_surf3Dwrite(double var,surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_double auxvar; + auxvar.x = __double2loint(var); + auxvar.y = __double2hiint(var); + surf3Dwrite(auxvar, surf, i*sizeof(fp_tex_double), j, k,mode); + } + + __device__ void fp_surf3Dwrite(pycuda::complex<float> var,surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cfloat auxvar; + auxvar.x = __float_as_int(var._M_re); + auxvar.y = __float_as_int(var._M_im); + + surf3Dwrite(auxvar, surf, i*sizeof(fp_tex_cfloat), j, k, mode); + } + + __device__ void fp_surf3Dwrite(pycuda::complex<double> var,surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cdouble auxvar; + auxvar.x = __double2loint(var._M_re); + auxvar.y = __double2hiint(var._M_re); + + auxvar.z = __double2loint(var._M_im); + auxvar.w = __double2hiint(var._M_im); + surf3Dwrite(auxvar, surf, i*sizeof(fp_tex_cdouble), j, k, mode); + } + + __device__ void fp_surf2DLayeredread(double *var, surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_double v; + surf2DLayeredread(&v, surf, i*sizeof(fp_tex_double), j, layer, mode); + *var = __hiloint2double(v.y, v.x); + } + + __device__ void 
fp_surf2DLayeredread(pycuda::complex<float> *var, surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cfloat v; + surf2DLayeredread(&v, surf, i*sizeof(fp_tex_cfloat), j, layer, mode); + *var = pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y)); + } + + __device__ void fp_surf2DLayeredread(pycuda::complex<double> *var, surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cdouble v; + surf2DLayeredread(&v, surf, i*sizeof(fp_tex_cdouble), j, layer, mode); + *var = pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); + } + + __device__ void fp_surf3Dread(double *var, surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_double v; + surf3Dread(&v, surf, i*sizeof(fp_tex_double), j, k, mode); + *var = __hiloint2double(v.y, v.x); + } + + __device__ void fp_surf3Dread(pycuda::complex<float> *var, surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cfloat v; + surf3Dread(&v, surf, i*sizeof(fp_tex_cfloat), j, k, mode); + *var = pycuda::complex<float>(__int_as_float(v.x), __int_as_float(v.y)); + } + + __device__ void fp_surf3Dread(pycuda::complex<double> *var, surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) + { + fp_tex_cdouble v; + surf3Dread(&v, surf, i*sizeof(fp_tex_cdouble), j, k, mode); + *var = pycuda::complex<double>(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); + } +#define PYCUDA_GENERATE_FP_TEX_FUNCS(TYPE) \ + template <enum cudaTextureReadMode read_mode> \ + __device__ TYPE fp_tex1Dfetch(texture<TYPE, 1, read_mode> tex, float i) \ + { \ + return tex1Dfetch(tex, i); \ + } \ + template <enum cudaTextureReadMode read_mode> \ + __device__ TYPE fp_tex2D(texture<TYPE, 2, read_mode> tex, float i, float j) \ + { \ + return tex2D(tex, i, j); \ + } \ + template <enum cudaTextureReadMode read_mode> \ + __device__ TYPE fp_tex2DLayered(texture<TYPE, cudaTextureType2DLayered, read_mode> tex, float i, float j, int layer) \ + { \ + return tex2DLayered(tex, i, j, layer); \ + } \ + template <enum cudaTextureReadMode read_mode> \ + __device__ TYPE fp_tex3D(texture<TYPE, 3, read_mode> tex, float i, float j, float k) \ + { \ + return tex3D(tex, i, j, k); \ + } \ + __device__ void fp_surf2DLayeredwrite(TYPE var,surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer,enum cudaSurfaceBoundaryMode mode) \ + { \ + surf2DLayeredwrite(var, surf, i*sizeof(TYPE), j, layer, mode); \ + } \ + __device__ void fp_surf2DLayeredread(TYPE *var, surface<void, cudaSurfaceType2DLayered> surf, float i, float j, int layer,enum cudaSurfaceBoundaryMode mode) \ + { \ + surf2DLayeredread(var, surf, i*sizeof(TYPE), j, layer, mode); \ + } \ + __device__ void fp_surf3Dwrite(TYPE var,surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) \ + { \ + surf3Dwrite(var, surf, i*sizeof(TYPE), j, k, mode); \ + } \ + __device__ void fp_surf3Dread(TYPE *var, surface<void, 3> surf, float i, float j, float k, enum cudaSurfaceBoundaryMode mode) \ + { \ + surf3Dread(var, surf, i*sizeof(TYPE), j, k, mode); \ + } + PYCUDA_GENERATE_FP_TEX_FUNCS(float) + PYCUDA_GENERATE_FP_TEX_FUNCS(int) + PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned int) + PYCUDA_GENERATE_FP_TEX_FUNCS(short int) + PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned short int) + PYCUDA_GENERATE_FP_TEX_FUNCS(char) + 
PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned char) +} + +#endif diff --git a/pystencils/interpolation_astnodes.py b/pystencils/interpolation_astnodes.py new file mode 100644 index 0000000000000000000000000000000000000000..b18c9c40e32c25668ef9d70fd6921f354bc1c261 --- /dev/null +++ b/pystencils/interpolation_astnodes.py @@ -0,0 +1,333 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. + +""" + +""" + +import itertools +from typing import Set + +import sympy as sp +from sympy.core.cache import cacheit + +import pystencils +from pystencils.astnodes import Node +from pystencils.data_types import TypedSymbol, cast_func, create_type + +try: + import pycuda.driver +except Exception: + pass + + +class LinearInterpolator(object): + """ + Implements non-integer accesses on fields using linear interpolation. + + On GPU, this interpolator can be implemented by a `TextureCachedField` for hardware acceleration. + + Address modes define the boundary handling; the possible choices are the same as for CUDA textures: + + **CLAMP** + + The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M. + + **BORDER** + + The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0 and for k >= M. + + Now, to describe the last two address modes, we are forced to consider normalized coordinates, + so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1. + + **WRAP** + + The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M. + In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p. + + **MIRROR** + + The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2. + In other words, c[l / M] = c[k / M] for any l and k such that (l + k) mod (2 * M - 2) = 0. 
+ + Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures + """ + + required_global_declarations = [] + + def __init__(self, parent_field: pystencils.Field, + address_mode='BORDER', + use_normalized_coordinates=False): + super().__init__() + + self.field = parent_field.new_field_with_different_name(parent_field.name) + self.field.field_type = pystencils.field.FieldType.CUSTOM + self.address_mode = address_mode + self.use_normalized_coordinates = use_normalized_coordinates + hash_str = "%x" % abs(hash(self.field) + hash(address_mode)) + self.symbol = TypedSymbol('dummy_symbol_carrying_field' + self.field.name + hash_str, + 'dummy_symbol_carrying_field' + self.field.name + hash_str) + self.symbol.field = self.field + self.symbol.interpolator = self + + def at(self, offset): + return InterpolatorAccess(self.symbol, *offset) + + def __getitem__(self, offset): + return InterpolatorAccess(self.symbol, *offset) + + def __str__(self): + return '%s_interpolator' % self.field.name + + def __repr__(self): + return self.__str__() + + def __hash__(self): + return hash((str(self.address_mode), + str(type(self)), + self.symbol, + self.address_mode, + self.use_normalized_coordinates)) + + +class InterpolatorAccess(TypedSymbol): + def __new__(cls, field, offsets, *args, **kwargs): + obj = TextureAccess.__xnew_cached_(cls, field, offsets, *args, **kwargs) + return obj + + def __new_stage2__(self, symbol, *offsets): + assert offsets is not None + obj = super().__xnew__(self, '%s_interpolator_%x' % + (symbol.field.name, abs(hash(tuple(offsets)))), symbol.field.dtype) + obj.offsets = offsets + obj.symbol = symbol + obj.field = symbol.field + obj.interpolator = symbol.interpolator + return obj + + def __hash__(self): + return hash((self.symbol, self.field, tuple(self.offsets), self.interpolator)) + + def __str__(self): + return '%s_interpolator(%s)' % (self.field.name, ','.join(str(o) for o in self.offsets)) + + def __repr__(self): + return self.__str__() + + def atoms(self, *types): + if self.offsets: + offsets = set(o for o in self.offsets if isinstance(o, types)) + if isinstance(self, *types): + offsets.update([self]) + for o in self.offsets: + if hasattr(o, 'atoms'): + offsets.update(set(o.atoms(types))) + return offsets + else: + return set() + + @property + def free_symbols(self): + symbols = set() + if self.offsets is not None: + for o in self.offsets: + if hasattr(o, 'free_symbols'): + symbols.update(set(o.free_symbols)) + # if hasattr(o, 'atoms'): + # symbols.update(set(o.atoms(sp.Symbol))) + + return symbols + + @property + def args(self): + return [self.symbol, *self.offsets] + + @property + def symbols_defined(self) -> Set[sp.Symbol]: + return {self} + + def implementation_with_stencils(self): + field = self.field + + default_int_type = create_type('uint32') + use_textures = isinstance(self, TextureAccess) + if use_textures: + def absolute_access(x, _): + return self.texture.at((o for o in x)) + else: + absolute_access = field.absolute_access + + sum = [0, ] * (field.shape[0] if field.index_dimensions else 1) + + offsets = self.offsets + rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1) + + # TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/ + for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions): + weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)]) + index = [f(offset) for (f, offset) in zip(c, offsets)] + for channel_idx in 
range(field.shape[0] if field.index_dimensions else 1): + # Hardware boundary handling on GPU + if use_textures: + weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)]) + sum[channel_idx] += \ + weight * absolute_access(index, channel_idx if field.index_dimensions else ()) + # else boundary handling using software + elif str(self.interpolator.address_mode).lower() == 'border': + is_inside_field = sp.And( + *itertools.chain([i >= 0 for i in index], + [idx < field.shape[dim] for (dim, idx) in enumerate(index)])) + index = [cast_func(i, default_int_type) for i in index] + sum[channel_idx] += sp.Piecewise( + (weight * absolute_access(index, channel_idx if field.index_dimensions else ()), + is_inside_field), + (sp.simplify(0), True) + ) + elif str(self.interpolator.address_mode).lower() == 'clamp': + index = [cast_func(sp.Min(sp.Max(0, i), field.shape[dim] - 1), default_int_type) + for (dim, i) in enumerate(index)] + sum[channel_idx] += weight * \ + absolute_access(index, channel_idx if field.index_dimensions else ()) + elif str(self.interpolator.address_mode).lower() == 'wrap': + index = [cast_func(sp.Piecewise((sp.Mod(i, field.shape[dim]), i >= 0), + (field.shape[dim] + sp.Mod(i, field.shape[dim]), True)), + default_int_type) + for (dim, i) in enumerate(index)] + sum[channel_idx] += weight * \ + absolute_access(index, channel_idx if field.index_dimensions else ()) + elif str(self.interpolator.address_mode).lower() == 'mirror': + def triangle_fun(x, half_period): + saw_tooth = sp.Abs(x) % (2 * half_period) + return sp.Piecewise((saw_tooth, saw_tooth < half_period), + (2 * half_period - 1 - saw_tooth, True)) + index = [cast_func(triangle_fun(i, field.shape[dim]), + default_int_type) for (dim, i) in enumerate(index)] + sum[channel_idx] += weight * absolute_access(index, channel_idx if field.index_dimensions else ()) + else: + raise NotImplementedError() + + sum = [sp.factor(s) for s in sum] + + if field.index_dimensions: + return sp.Matrix(sum) + else: + return sum[0] + + # noinspection SpellCheckingInspection + __xnew__ = staticmethod(__new_stage2__) + # noinspection SpellCheckingInspection + __xnew_cached_ = staticmethod(cacheit(__new_stage2__)) + +########################################################################################## +# GPU-specific fast specializations (for precision GPUs can also use above nodes/symbols # +########################################################################################## + + +class TextureCachedField: + + def __init__(self, parent_field, + address_mode=None, + filter_mode=None, + use_normalized_coordinates=False, + read_as_integer=False, + cubic_bspline_interpolation=False): + if isinstance(address_mode, str): + address_mode = getattr(pycuda.driver.address_mode, address_mode.upper()) + + if address_mode is None: + address_mode = pycuda.driver.address_mode.BORDER + if filter_mode is None: + filter_mode = pycuda.driver.filter_mode.LINEAR + + # self, field_name, field_type, dtype, layout, shape, strides + self.field = parent_field + self.required_global_declarations = [TextureDeclaration(self)] + self.address_mode = address_mode + self.filter_mode = filter_mode + self.read_as_integer = read_as_integer + self.use_normalized_coordinates = use_normalized_coordinates + self.symbol = TypedSymbol(str(self), self.field.dtype.numpy_dtype) + self.symbol.interpolator = self + self.symbol.field = self.field + self.cubic_bspline_interpolation = cubic_bspline_interpolation + + # assert str(self.field.dtype) != 'double', "CUDA does 
not support double textures!" + # assert dtype_supports_textures(self.field.dtype), "CUDA only supports texture types with 32 bits or less" + + @classmethod + def from_interpolator(cls, interpolator: LinearInterpolator): + obj = cls(interpolator.field, interpolator.address_mode) + return obj + + def at(self, offset): + return TextureAccess(self.symbol, *offset) + + def __getitem__(self, offset): + return TextureAccess(self.symbol, *offset) + + def __str__(self): + return '%s_texture_%x' % (self.field.name, abs(hash(self.field) + hash(str(self.address_mode)))) + + def __repr__(self): + return self.__str__() + + +class TextureAccess(InterpolatorAccess): + def __new__(cls, texture_symbol, offsets, *args, **kwargs): + obj = TextureAccess.__xnew_cached_(cls, texture_symbol, offsets, *args, **kwargs) + return obj + + def __new_stage2__(self, symbol, *offsets): + obj = super().__xnew__(self, symbol, *offsets) + obj.required_global_declarations = symbol.interpolator.required_global_declarations + obj.required_global_declarations[0]._symbols_defined.add(obj) + return obj + + def __str__(self): + return '%s_texture(%s)' % (self.interpolator.field.name, ','.join(str(o) for o in self.offsets)) + + @property + def texture(self): + return self.interpolator + + # noinspection SpellCheckingInspection + __xnew__ = staticmethod(__new_stage2__) + # noinspection SpellCheckingInspection + __xnew_cached_ = staticmethod(cacheit(__new_stage2__)) + + +class TextureDeclaration(Node): + + def __init__(self, parent_texture): + self.texture = parent_texture + self._symbols_defined = set() + + @property + def symbols_defined(self) -> Set[sp.Symbol]: + return self._symbols_defined + + @property + def args(self) -> Set[sp.Symbol]: + return set() + + +def dtype_supports_textures(dtype): + """ + Returns whether CUDA supports texture fetches with this numpy dtype. + + The maximum word size for a texture fetch is four bytes. + + With this trick also larger dtypes can be fetched: + https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp + + """ + if hasattr(dtype, 'numpy_dtype'): + dtype = dtype.numpy_dtype + + if isinstance(dtype, type): + return dtype().itemsize <= 4 + + return dtype.itemsize <= 4 diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py index ade980f55969a4005f2c5055ad27f861ab5524de..866f1a4ef5357e6a1a74d21b174d24785c398677 100644 --- a/pystencils/kernelcreation.py +++ b/pystencils/kernelcreation.py @@ -12,10 +12,18 @@ from pystencils.transformations import ( loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel) -def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None, +def create_kernel(assignments, + target='cpu', + data_type="double", + iteration_slice=None, + ghost_layers=None, skip_independence_check=False, - cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None, - gpu_indexing='block', gpu_indexing_params=MappingProxyType({})): + cpu_openmp=False, + cpu_vectorize_info=None, + cpu_blocking=None, + gpu_indexing='block', + gpu_indexing_params=MappingProxyType({}), + use_textures_for_interpolation=True): """ Creates abstract syntax tree (AST) of kernel, using a list of update equations. 
@@ -99,14 +107,22 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice ast = create_cuda_kernel(assignments, type_info=data_type, indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params), iteration_slice=iteration_slice, ghost_layers=ghost_layers, - skip_independence_check=skip_independence_check) + skip_independence_check=skip_independence_check, + use_textures_for_interpolation=use_textures_for_interpolation) return ast else: raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,)) -def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="double", coordinate_names=('x', 'y', 'z'), - cpu_openmp=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})): +def create_indexed_kernel(assignments, + index_fields, + target='cpu', + data_type="double", + coordinate_names=('x', 'y', 'z'), + cpu_openmp=True, + gpu_indexing='block', + gpu_indexing_params=MappingProxyType({}), + use_textures_for_interpolation=True): """ Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling. @@ -159,8 +175,12 @@ def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="do elif target == 'gpu': from pystencils.gpucuda import created_indexed_cuda_kernel idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params) - ast = created_indexed_cuda_kernel(assignments, index_fields, type_info=data_type, - coordinate_names=coordinate_names, indexing_creator=idx_creator) + ast = created_indexed_cuda_kernel(assignments, + index_fields, + type_info=data_type, + coordinate_names=coordinate_names, + indexing_creator=idx_creator, + use_textures_for_interpolation=use_textures_for_interpolation) return ast else: raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,)) diff --git a/pystencils/spatial_coordinates.py b/pystencils/spatial_coordinates.py new file mode 100644 index 0000000000000000000000000000000000000000..6c3ba4db7d98b50bce9442619fbbce07782004a1 --- /dev/null +++ b/pystencils/spatial_coordinates.py @@ -0,0 +1,18 @@ + +import sympy + +import pystencils +import pystencils.astnodes + +x_, y_, z_ = tuple(pystencils.astnodes.LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(3)) +x_staggered, y_staggered, z_staggered = x_ + 0.5, y_ + 0.5, z_ + 0.5 + + +def x_vector(ndim): + return sympy.Matrix(tuple(pystencils.astnodes.LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(ndim))) + + +def x_staggered_vector(ndim): + return sympy.Matrix(tuple( + pystencils.astnodes.LoopOverCoordinate.get_loop_counter_symbol(i) + 0.5 for i in range(ndim) + )) diff --git a/pystencils/test_gettypeofexpression.py b/pystencils/test_gettypeofexpression.py new file mode 100644 index 0000000000000000000000000000000000000000..fa480409efc36a91fddfa8d7aff74d745d3624fd --- /dev/null +++ b/pystencils/test_gettypeofexpression.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. 
+ +""" + +""" +import sympy as sp + +from pystencils.data_types import create_type, get_type_of_expression + + +def test_gettypeofexpression(): + random_constant = sp.sqrt(sp.pi / 5) + type = get_type_of_expression(random_constant) + type == create_type('float64') + + random_constant = sp.pi + type = get_type_of_expression(random_constant) + type == create_type('float64') + + random_constant = sp.pi / sp.pi + type = get_type_of_expression(random_constant) + type == create_type('int64') + + +def main(): + test_gettypeofexpression() + + +if __name__ == '__main__': + main() diff --git a/pystencils/transformations.py b/pystencils/transformations.py index 60a22d812578db85d7f375a74144352e431ffc62..e38027d2f4c7e7fb9eb40b88f48fc394f43fb8d4 100644 --- a/pystencils/transformations.py +++ b/pystencils/transformations.py @@ -9,11 +9,13 @@ import sympy as sp from sympy.logic.boolalg import Boolean import pystencils.astnodes as ast +import pystencils.integer_functions from pystencils.assignment import Assignment from pystencils.data_types import ( PointerType, StructType, TypedSymbol, cast_func, collate_types, create_type, get_base_type, get_type_of_expression, pointer_arithmetic_func, reinterpret_cast_func) from pystencils.field import AbstractField, Field, FieldType +from pystencils.interpolation_astnodes import InterpolatorAccess, TextureAccess, TextureCachedField from pystencils.kernelparameters import FieldPointerSymbol from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.slicing import normalize_slice @@ -503,7 +505,10 @@ def resolve_field_accesses(ast_node, read_only_field_names=set(), if isinstance(expr, ast.ResolvedFieldAccess): return expr - new_args = [visit_sympy_expr(e, enclosing_block, sympy_assignment) for e in expr.args] + if hasattr(expr, 'args'): + new_args = [visit_sympy_expr(e, enclosing_block, sympy_assignment) for e in expr.args] + else: + new_args = [] kwargs = {'evaluate': False} if type(expr) in (sp.Add, sp.Mul, sp.Piecewise) else {} return expr.func(*new_args, **kwargs) if new_args else expr @@ -521,7 +526,9 @@ def resolve_field_accesses(ast_node, read_only_field_names=set(), if sub_ast.false_block: visit_node(sub_ast.false_block) else: - for i, a in enumerate(sub_ast.args): + if isinstance(sub_ast, (bool, int, float)): + return + for a in sub_ast.args: visit_node(a) return visit_node(ast_node) @@ -798,12 +805,26 @@ class KernelConstraintsCheck: self.fields_read.add(rhs.field) self.fields_read.update(rhs.indirect_addressing_fields) return rhs + elif isinstance(rhs, InterpolatorAccess): + new_args = [self.process_expression(arg, type_constants) for arg in rhs.offsets] + if new_args: + rhs.offsets = new_args + return rhs elif isinstance(rhs, TypedSymbol): return rhs elif isinstance(rhs, sp.Symbol): return TypedSymbol(rhs.name, self._type_for_symbol[rhs.name]) elif type_constants and isinstance(rhs, sp.Number): return cast_func(rhs, create_type(self._type_for_symbol['_constant'])) + elif isinstance(rhs, sp.boolalg.BooleanFunction) or \ + type(rhs) in pystencils.integer_functions.__dict__.values(): + new_args = [self.process_expression(a, type_constants) for a in rhs.args] + types_of_expressions = [get_type_of_expression(a) for a in new_args] + arg_type = collate_types(types_of_expressions, forbid_collation_to_float=True) + new_args = [a if not hasattr(a, 'dtype') or a.dtype == arg_type + else cast_func(a, arg_type) + for a in new_args] + return rhs.func(*new_args) elif isinstance(rhs, sp.Mul): new_args = [self.process_expression(arg, 
type_constants) if arg not in (-1, 1) else arg for arg in rhs.args] return rhs.func(*new_args) if new_args else rhs @@ -955,7 +976,8 @@ def insert_casts(node): # TODO optimize pow, don't cast integer on double types = [get_type_of_expression(arg) for arg in args] assert len(types) > 0 - target = collate_types(types) + # Never ever, ever collate to float type for boolean functions! + target = collate_types(types, forbid_collation_to_float=isinstance(node.func, sp.boolalg.BooleanFunction)) zipped = list(zip(args, types)) if target.func is PointerType: assert node.func is sp.Add @@ -1150,7 +1172,6 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None: def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: """Blocking of loops to enhance cache locality. Modifies the ast node in-place. - Args: ast_node: kernel function node before vectorization transformation has been applied block_size: sequence defining block size in x, y, (z) direction @@ -1195,3 +1216,48 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: inner_loop.start = block_ctr inner_loop.stop = stop return len(coordinates) + + +def implement_interpolations(ast_node: ast.Node, + implement_by_texture_accesses: bool = False, + vectorize: bool = False, + use_hardware_interpolation_for_f32=True): + # TODO: perform this function on assignments, when unify_shape_symbols allows differently sized fields + + assert not(implement_by_texture_accesses and vectorize), \ + "can only implement interpolations either by texture accesses or CPU vectorization" + FLOAT32_T = create_type('float32') + + interpolation_accesses = ast_node.atoms(InterpolatorAccess) + + def can_use_hw_interpolation(i): + return use_hardware_interpolation_for_f32 and i.dtype == FLOAT32_T and isinstance(i, TextureAccess) + + if implement_by_texture_accesses: + + interpolators = {a.symbol.interpolator for a in interpolation_accesses} + to_texture_map = {i: TextureCachedField.from_interpolator(i) for i in interpolators} + + substitutions = {i: to_texture_map[i.symbol.interpolator].at( + [o for o in i.offsets]) for i in interpolation_accesses} + + import pycuda.driver as cuda + for texture in substitutions.values(): + if can_use_hw_interpolation(texture): + texture.filter_mode = cuda.filter_mode.LINEAR + else: + texture.filter_mode = cuda.filter_mode.POINT + texture.read_as_integer = True + + ast_node.subs(substitutions) + + # Update after replacements + interpolation_accesses = ast_node.atoms(InterpolatorAccess) + + if vectorize: + # TODO can be done in _interpolator_access_to_stencils field.absolute_access == simd_gather + raise NotImplementedError() + else: + substitutions = {i: i.implementation_with_stencils() + for i in interpolation_accesses if not can_use_hw_interpolation(i)} + ast_node.subs(substitutions) diff --git a/pystencils_tests/test_cuda_known_functions.py b/pystencils_tests/test_cuda_known_functions.py index 5742a3262806d60024824994d2baf28ae6f2d8ec..ce0bdfea47f368b466b42dd308f0443303cb6ea5 100644 --- a/pystencils_tests/test_cuda_known_functions.py +++ b/pystencils_tests/test_cuda_known_functions.py @@ -1,7 +1,7 @@ import sympy import pystencils -from pystencils.astnodes import get_dummy_symbol +from pystencils.data_types import get_dummy_symbol from pystencils.backends.cuda_backend import CudaSympyPrinter from pystencils.data_types import address_of diff --git a/pystencils_tests/test_cudasympyprinter.py b/pystencils_tests/test_cudasympyprinter.py new file mode 100644 index 
0000000000000000000000000000000000000000..08c14aa79b7ca5a699b476e5455eb4356ebedd1c --- /dev/null +++ b/pystencils_tests/test_cudasympyprinter.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. + +""" + +""" +import sympy as sp + +import pystencils + + +def test_cudasympyprinter(): + + for use_single_precision in (False, True): + x, y = pystencils.fields('x,y: float%s [2d]' % ('32' if use_single_precision else '64')) + a, b, c = sp.symbols('a,b,c') + + random_constant_with_integer = sp.sqrt(sp.pi / 5) + + assignments = pystencils.AssignmentCollection({ + a: 1 / sp.sqrt(x[1, 1]), + b: random_constant_with_integer, + y.center(): sp.sqrt(x.center()) * (x[-1, 0] ** x[+1, 0]) % (x[-1, 0] ** x[+1, 0]) + }) + print(assignments) + ast = pystencils.create_kernel(assignments, target='gpu') + print(ast) + code = str(pystencils.show_code(ast)) + print(code) + + if use_single_precision: + assert 'fmodf(' in code + assert 'sqrtf(' in code + assert 'powf(' in code + else: + assert 'fmod(' in code + assert 'sqrt(' in code + assert 'pow(' in code + + +def main(): + test_cudasympyprinter() + + +if __name__ == '__main__': + main() diff --git a/pystencils_tests/test_data/lenna.png b/pystencils_tests/test_data/lenna.png new file mode 100644 index 0000000000000000000000000000000000000000..59ef68aabd033341028ccd2b1f6c5871c02b9d7b Binary files /dev/null and b/pystencils_tests/test_data/lenna.png differ diff --git a/pystencils_tests/test_data/test_vessel2d_mask.png b/pystencils_tests/test_data/test_vessel2d_mask.png new file mode 100644 index 0000000000000000000000000000000000000000..1e86b699b10cd8749f899a7600cd0a46fbe98b7b Binary files /dev/null and b/pystencils_tests/test_data/test_vessel2d_mask.png differ diff --git a/pystencils_tests/test_field_coordinates.py b/pystencils_tests/test_field_coordinates.py new file mode 100644 index 0000000000000000000000000000000000000000..1aeed343bc0c77d6e6b56b3d48e68c100be3f56c --- /dev/null +++ b/pystencils_tests/test_field_coordinates.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de> +# +# Distributed under terms of the GPLv3 license. 
+
+"""
+Tests for Field coordinate transforms used together with LinearInterpolator.
+"""
+from os.path import dirname, join
+
+import numpy as np
+import sympy
+
+import pystencils
+from pystencils.interpolation_astnodes import LinearInterpolator
+
+try:
+    import pyconrad.autoinit
+except Exception:
+    import unittest.mock
+    pyconrad = unittest.mock.MagicMock()
+
+LENNA_FILE = join(dirname(__file__), 'test_data', 'lenna.png')
+
+try:
+    import skimage.io
+    lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float32)
+except Exception:
+    lenna = np.random.rand(20, 30)
+
+
+def test_rotate_center():
+    x, y = pystencils.fields('x, y: float32[2d]')
+
+    # Rotate around the center by setting the coordinate origins to the field centers
+    x.set_coordinate_origin_to_field_center()
+    y.set_coordinate_origin_to_field_center()
+
+    rotation_angle = sympy.pi / 5
+    transform_matrix = sympy.rot_axis3(rotation_angle)[:2, :2]
+
+    # A generic matrix transform works like this (for a pure rotation, transform_matrix.T would be the cheaper inverse)
+    inverse_matrix = transform_matrix.inv()
+    input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
+
+    assignments = pystencils.AssignmentCollection({
+        y.center(): LinearInterpolator(x).at(input_coordinate)
+    })
+
+    kernel = pystencils.create_kernel(assignments).compile()
+    rotated = np.zeros_like(lenna)
+
+    kernel(x=lenna, y=rotated)
+
+    pyconrad.imshow(lenna, "lenna")
+    pyconrad.imshow(rotated, "rotated")
+
+    # If the input field's sample spacing is halved, the image appears smaller
+    x.coordinate_transform /= 2
+
+    input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
+
+    assignments = pystencils.AssignmentCollection({
+        y.center(): LinearInterpolator(x).at(input_coordinate)
+    })
+
+    kernel = pystencils.create_kernel(assignments).compile()
+    rotated = np.zeros_like(lenna)
+
+    kernel(x=lenna, y=rotated)
+
+    pyconrad.imshow(rotated, "rotated smaller")
+
+    # Conversely, if the output field's samples are three times closer together, the image appears bigger
+    y.coordinate_transform /= 3
+
+    input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
+
+    assignments = pystencils.AssignmentCollection({
+        y.center(): LinearInterpolator(x).at(input_coordinate)
+    })
+
+    kernel = pystencils.create_kernel(assignments).compile()
+    rotated = np.zeros_like(lenna)
+
+    kernel(x=lenna, y=rotated)
+
+    pyconrad.imshow(rotated, "rotated bigger")
+
+    # coordinate_transform can be any matrix, including one with symbolic entries
+
+
+def main():
+    test_rotate_center()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pystencils_tests/test_interpolation.py b/pystencils_tests/test_interpolation.py
new file mode 100644
index 0000000000000000000000000000000000000000..64ec853d45b6315d666be267ff1a821b5db6c65b
--- /dev/null
+++ b/pystencils_tests/test_interpolation.py
@@ -0,0 +1,224 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+ +""" + +""" +from os.path import dirname, join + +import numpy as np +import sympy + +import pycuda.autoinit # NOQA +import pycuda.gpuarray as gpuarray +import pystencils +from pystencils.interpolation_astnodes import LinearInterpolator +from pystencils.spatial_coordinates import x_, y_ + +type_map = { + np.float32: 'float32', + np.float64: 'float64', + np.int32: 'int32', +} + +try: + import pyconrad.autoinit +except Exception: + import unittest.mock + pyconrad = unittest.mock.MagicMock() + +LENNA_FILE = join(dirname(__file__), 'test_data', 'lenna.png') + +try: + import skimage.io + lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float64) + pyconrad.imshow(lenna) +except Exception: + lenna = np.random.rand(20, 30) + + +def test_interpolation(): + x_f, y_f = pystencils.fields('x,y: float64 [2d]') + + assignments = pystencils.AssignmentCollection({ + y_f.center(): LinearInterpolator(x_f).at([x_ + 2.7, y_ + 7.2]) + }) + print(assignments) + ast = pystencils.create_kernel(assignments) + print(ast) + print(pystencils.show_code(ast)) + kernel = ast.compile() + + pyconrad.imshow(lenna) + + out = np.zeros_like(lenna) + kernel(x=lenna, y=out) + pyconrad.imshow(out, "out") + + +def test_scale_interpolation(): + x_f, y_f = pystencils.fields('x,y: float64 [2d]') + + for address_mode in ['border', 'wrap', 'clamp', 'mirror']: + assignments = pystencils.AssignmentCollection({ + y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at([0.5 * x_ + 2.7, 0.25 * y_ + 7.2]) + }) + print(assignments) + ast = pystencils.create_kernel(assignments) + print(ast) + print(pystencils.show_code(ast)) + kernel = ast.compile() + + out = np.zeros_like(lenna) + kernel(x=lenna, y=out) + pyconrad.imshow(out, "out " + address_mode) + + +def test_rotate_interpolation(): + x_f, y_f = pystencils.fields('x,y: float64 [2d]') + + rotation_angle = sympy.pi / 5 + + for address_mode in ['border', 'wrap', 'clamp', 'mirror']: + + transformed = sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + assignments = pystencils.AssignmentCollection({ + y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed) + }) + print(assignments) + ast = pystencils.create_kernel(assignments) + print(ast) + print(pystencils.show_code(ast)) + kernel = ast.compile() + + out = np.zeros_like(lenna) + kernel(x=lenna, y=out) + pyconrad.imshow(out, "out " + address_mode) + + +def test_rotate_interpolation_gpu(): + + rotation_angle = sympy.pi / 5 + scale = 1 + + for address_mode in ['border', 'wrap', 'clamp', 'mirror']: + previous_result = None + for dtype in [np.int32, np.float32, np.float64]: + if dtype == np.int32: + lenna_gpu = gpuarray.to_gpu( + np.ascontiguousarray(lenna * 255, dtype)) + else: + lenna_gpu = gpuarray.to_gpu( + np.ascontiguousarray(lenna, dtype)) + for use_textures in [True, False]: + x_f, y_f = pystencils.fields('x,y: %s [2d]' % type_map[dtype], ghost_layers=0) + + transformed = scale * \ + sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) - sympy.Matrix([2, 2]) + assignments = pystencils.AssignmentCollection({ + y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed) + }) + print(assignments) + ast = pystencils.create_kernel(assignments, target='gpu', use_textures_for_interpolation=use_textures) + print(ast) + print(pystencils.show_code(ast)) + kernel = ast.compile() + + out = gpuarray.zeros_like(lenna_gpu) + kernel(x=lenna_gpu, y=out) + pyconrad.imshow(out, + f"out {address_mode} texture:{use_textures} {type_map[dtype]}") + 
skimage.io.imsave(f"/tmp/out {address_mode} texture:{use_textures} {type_map[dtype]}.tif", + np.ascontiguousarray(out.get(), np.float32)) + if previous_result is not None: + try: + assert np.allclose(previous_result[4:-4, 4:-4], out.get()[4:-4, 4:-4], rtol=100, atol=1e-3) + except AssertionError as e: # NOQA + print("Max error: %f" % np.max(previous_result - out.get())) + # pyconrad.imshow(previous_result - out.get(), "Difference image") + # raise e + previous_result = out.get() + + +def test_shift_interpolation_gpu(): + + rotation_angle = 0 # sympy.pi / 5 + scale = 1 + # shift = - sympy.Matrix([1.5, 1.5]) + shift = sympy.Matrix((0.0, 0.0)) + + for address_mode in ['border', 'wrap', 'clamp', 'mirror']: + previous_result = None + for dtype in [np.float64, np.float32, np.int32]: + if dtype == np.int32: + lenna_gpu = gpuarray.to_gpu( + np.ascontiguousarray(lenna * 255, dtype)) + else: + lenna_gpu = gpuarray.to_gpu( + np.ascontiguousarray(lenna, dtype)) + for use_textures in [True, False]: + x_f, y_f = pystencils.fields('x,y: %s [2d]' % type_map[dtype], ghost_layers=0) + + if use_textures: + transformed = scale * sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + shift + else: + transformed = scale * sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + shift + assignments = pystencils.AssignmentCollection({ + y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed) + }) + # print(assignments) + ast = pystencils.create_kernel(assignments, target='gpu', use_textures_for_interpolation=use_textures) + # print(ast) + print(pystencils.show_code(ast)) + kernel = ast.compile() + + out = gpuarray.zeros_like(lenna_gpu) + kernel(x=lenna_gpu, y=out) + pyconrad.imshow(out, + f"out {address_mode} texture:{use_textures} {type_map[dtype]}") + skimage.io.imsave(f"/tmp/out {address_mode} texture:{use_textures} {type_map[dtype]}.tif", + np.ascontiguousarray(out.get(), np.float32)) + # if not (use_single_precision and use_textures): + # if previous_result is not None: + # try: + # assert np.allclose(previous_result[4:-4, 4:-4], out.get() + # [4:-4, 4:-4], rtol=1e-3, atol=1e-2) + # except AssertionError as e: + # print("Max error: %f" % np.max(np.abs(previous_result[4:-4, 4:-4] - out.get()[4:-4, 4:-4]))) + # pyconrad.imshow(previous_result[4:-4, 4:-4] - out.get()[4:-4, 4:-4], "Difference image") + # raise e + # previous_result = out.get() + + +def test_rotate_interpolation_size_change(): + x_f, y_f = pystencils.fields('x,y: float64 [2d]') + + rotation_angle = sympy.pi / 5 + + for address_mode in ['border', 'wrap', 'clamp', 'mirror']: + + transformed = sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + assignments = pystencils.AssignmentCollection({ + y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed) + }) + print(assignments) + ast = pystencils.create_kernel(assignments) + print(ast) + print(pystencils.show_code(ast)) + kernel = ast.compile() + + out = np.zeros((100, 150), np.float64) + kernel(x=lenna, y=out) + pyconrad.imshow(out, "small out " + address_mode) + + +def main(): + + test_shift_interpolation_gpu() + + +if __name__ == '__main__': + main()
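A minimal usage sketch of the interpolation API introduced by this patch (LinearInterpolator together with the coordinate symbols from pystencils.spatial_coordinates), assuming the patch is applied; the field names, the rotation angle and the array shape are illustrative only. On the CPU target the interpolator accesses are lowered to ordinary stencil code, so no pycuda is needed:

    import numpy as np
    import sympy

    import pystencils
    from pystencils.interpolation_astnodes import LinearInterpolator
    from pystencils.spatial_coordinates import x_, y_

    # source and destination fields; the names are arbitrary
    src, dst = pystencils.fields('src, dst: float64[2d]')

    # sample the source at coordinates rotated by pi/8 around the coordinate origin
    sampling_position = sympy.rot_axis3(sympy.pi / 8)[:2, :2] * sympy.Matrix((x_, y_))

    assignments = pystencils.AssignmentCollection({
        dst.center(): LinearInterpolator(src, address_mode='border').at(sampling_position)
    })
    kernel = pystencils.create_kernel(assignments).compile()

    data = np.random.rand(64, 64)
    out = np.zeros_like(data)
    kernel(src=data, dst=out)

On the GPU target, create_kernel additionally accepts use_textures_for_interpolation (default True); with textures enabled, float32 fields can use hardware linear filtering, while other element types fall back to the point-filtered, read-as-integer path set up in implement_interpolations.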