Commit 30da6576 authored by Jan Hönig

Merge branch 'RemoveOpenCL' into 'master'

Removed OpenCL

See merge request !278
parents 0ed1a87b 9afc38bb
Pipeline #42282 failed
Showing with 102 additions and 619 deletions
@@ -4,3 +4,4 @@
 ### Removed
 * LLVM backend because it was not used much and not well integrated in pystencils.
+* OpenCL backend because it was not used much and not well integrated in pystencils.
@@ -53,7 +53,6 @@ Without `[interactive]` you get a minimal version with very few dependencies.
 All options:
 - `gpu`: use this if an NVIDIA GPU is available and CUDA is installed
-- `opencl`: basic OpenCL support (experimental)
 - `alltrafos`: pulls in additional dependencies for loop simplification, e.g. libisl
 - `bench_db`: functionality to store benchmark results in object databases
 - `interactive`: installs dependencies to work in Jupyter including image I/O, plotting etc.
...
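For reference, an extra is selected at install time, e.g. (a hedged example, assuming the package name on PyPI):

pip install pystencils[interactive]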
This diff is collapsed.
@@ -47,7 +47,7 @@ def generate_c(ast_node: Node,
     Args:
         ast_node: ast representation of kernel
         signature_only: generate signature without function body
-        dialect: `Backend`: 'C', 'CUDA' or 'OPENCL'
+        dialect: `Backend`: 'C' or 'CUDA'
         custom_backend: use own custom printer for code generation
         with_globals: enable usage of global variables
     Returns:
@@ -71,9 +71,6 @@ def generate_c(ast_node: Node,
     elif dialect == Backend.CUDA:
         from pystencils.backends.cuda_backend import CudaBackend
         printer = CudaBackend(signature_only=signature_only)
-    elif dialect == Backend.OPENCL:
-        from pystencils.backends.opencl_backend import OpenClBackend
-        printer = OpenClBackend(signature_only=signature_only)
     else:
         raise ValueError(f'Unknown {dialect=}')
     code = printer(ast_node)
...
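After this change the dispatch above ends at the CUDA branch. A minimal sketch of reaching generate_c through the public API (the kernel and field names are made up for illustration):

import pystencils as ps

src, dst = ps.fields("src, dst: float64[2D]")  # hypothetical fields
ast = ps.create_kernel(ps.Assignment(dst.center, (src[1, 0] + src[-1, 0]) / 2))
print(ps.get_code_str(ast))  # reaches generate_c with dialect=Backend.C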
acos
acosh
acospi
asin
asinh
asinpi
atan
atan2
atanh
atanpi
atan2pi
cbrt
ceil
copysign
cos
cosh
cospi
erfc
erf
exp
exp2
exp10
expm1
fabs
fdim
floor
fma
fmax
fmin
fmod
fract
frexp
hypot
ilogb
ldexp
lgamma
lgamma_r
log
log2
log10
log1p
logb
mad
maxmag
minmag
modf
nextafter
pow
pown
powr
remquo
rint
rootn
round
rsqrt
sin
sincos
sinh
sinpi
sqrt
tan
tanh
tanpi
tgamma
trunc
half_cos
half_divide
half_exp
half_exp2
half_exp10
half_log
half_log2
half_log10
half_powr
half_recip
half_rsqrt
half_sin
half_sqrt
half_tan
native_cos
native_divide
native_exp
native_exp2
native_exp10
native_log
native_log2
native_log10
native_powr
native_recip
native_rsqrt
native_sin
native_sqrt
native_tan
from os.path import dirname, join

import pystencils.data_types
from pystencils.astnodes import Node
from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
from pystencils.backends.cuda_backend import CudaBackend, CudaSympyPrinter
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt

with open(join(dirname(__file__), 'opencl1.1_known_functions.txt')) as f:
    lines = f.readlines()
    OPENCL_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l.strip()}
def generate_opencl(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
    """Prints an abstract syntax tree node (made for `Target` 'GPU') as OpenCL code.  # TODO Backend instead of Target?

    Args:
        ast_node: ast representation of kernel
        signature_only: generate signature without function body
        custom_backend: use own custom printer for code generation
        with_globals: enable usage of global variables

    Returns:
        OpenCL code for the ast node and its descendants
    """
    return generate_c(ast_node, signature_only, dialect=Backend.OPENCL,
                      custom_backend=custom_backend, with_globals=with_globals)


class OpenClBackend(CudaBackend):

    def __init__(self,
                 sympy_printer=None,
                 signature_only=False):
        if not sympy_printer:
            sympy_printer = OpenClSympyPrinter()

        super().__init__(sympy_printer, signature_only)
        self._dialect = Backend.OPENCL

    def _print_Type(self, node):
        code = super()._print_Type(node)
        if isinstance(node, pystencils.data_types.PointerType):
            return "__global " + code
        else:
            return code

    def _print_ThreadBlockSynchronization(self, node):
        raise NotImplementedError()

    def _print_TextureDeclaration(self, node):
        raise NotImplementedError()


class OpenClSympyPrinter(CudaSympyPrinter):
    language = "OpenCL"

    DIMENSION_MAPPING = {
        'x': '0',
        'y': '1',
        'z': '2'
    }
    INDEXING_FUNCTION_MAPPING = {
        'blockIdx': 'get_group_id',
        'threadIdx': 'get_local_id',
        'blockDim': 'get_local_size',
        'gridDim': 'get_global_size'
    }

    def __init__(self):
        CustomSympyPrinter.__init__(self)
        self.known_functions = OPENCL_KNOWN_FUNCTIONS

    def _print_Type(self, node):
        code = super()._print_Type(node)
        if isinstance(node, pystencils.data_types.PointerType):
            return "__global " + code
        else:
            return code

    def _print_ThreadIndexingSymbol(self, node):
        symbol_name: str = node.name
        function_name, dimension = tuple(symbol_name.split("."))
        dimension = self.DIMENSION_MAPPING[dimension]
        function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
        return f"(int64_t) {function_name}({dimension})"
    def _print_TextureAccess(self, node):
        raise NotImplementedError()

    # For math functions, OpenCL is more similar to the C++ printer CustomSympyPrinter
    # since built-in math functions are generic.
    # In CUDA, you have to differentiate between `sin` and `sinf`
    try:
        _print_math_func = CustomSympyPrinter._print_math_func
    except AttributeError:
        pass
    _print_Pow = CustomSympyPrinter._print_Pow

    def _print_Function(self, expr):
        if isinstance(expr, fast_division):
            return "native_divide(%s, %s)" % tuple(self._print(a) for a in expr.args)
        elif isinstance(expr, fast_sqrt):
            # fast_sqrt and fast_inv_sqrt are unary, so print the single argument
            # directly instead of interpolating a Python tuple into the source
            return f"native_sqrt({self._print(expr.args[0])})"
        elif isinstance(expr, fast_inv_sqrt):
            return f"native_rsqrt({self._print(expr.args[0])})"
        return CustomSympyPrinter._print_Function(self, expr)
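For context, a hedged sketch of how this printer was typically driven, reusing create_cuda_kernel as elsewhere in this diff (field names are made up):

from pystencils import Assignment, fields
from pystencils.gpucuda.kernelcreation import create_cuda_kernel

src, dst = fields("src, dst: float64[2D]")  # hypothetical fields
ast = create_cuda_kernel([Assignment(dst.center, 2 * src.center)])
opencl_code = generate_opencl(ast)  # FUNC_PREFIX/__kernel is defined later by the JIT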
@@ -23,8 +23,7 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1,
-                         opencl_queue=None) -> DataHandling:
+                         default_ghost_layers: int = 1) -> DataHandling:
     """Creates a data handling instance.

     Args:
@@ -43,7 +42,6 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target = new_target

     if parallel:
-        assert not opencl_queue, "OpenCL is only supported for SerialDataHandling"
         if wlb is None:
             raise ValueError("Cannot create parallel data handling because walberla module is not available")
@@ -71,8 +69,7 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers,
-                                  opencl_queue=opencl_queue)
+                                  default_ghost_layers=default_ghost_layers)

 __all__ = ['create_data_handling']
@@ -17,8 +17,8 @@ class DataHandling(ABC):
     'gather' function that collects (parts of the) distributed data on a single process.
     """

-    _GPU_LIKE_TARGETS = [Target.GPU, Target.OPENCL]
-    _GPU_LIKE_BACKENDS = [Backend.CUDA, Backend.OPENCL]
+    _GPU_LIKE_TARGETS = [Target.GPU]
+    _GPU_LIKE_BACKENDS = [Backend.CUDA]

     # ---------------------------- Adding and accessing data -----------------------------------------------------------

     @property
...
try:
    import pyopencl.array as gpuarray
except ImportError:
    gpuarray = None

import numpy as np

import pystencils


class PyOpenClArrayHandler:

    def __init__(self, queue):
        if not queue:
            from pystencils.opencl.opencljit import get_global_cl_queue
            queue = get_global_cl_queue()
        assert queue, "OpenCL queue missing!\n" \
                      "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
        self.queue = queue

    def zeros(self, shape, dtype=np.float64, order='C'):
        cpu_array = np.zeros(shape=shape, dtype=dtype, order=order)
        return self.to_gpu(cpu_array)

    def ones(self, shape, dtype=np.float64, order='C'):
        cpu_array = np.ones(shape=shape, dtype=dtype, order=order)
        return self.to_gpu(cpu_array)

    def empty(self, shape, dtype=np.float64, layout=None):
        if layout:
            cpu_array = pystencils.field.create_numpy_array_with_layout(shape=shape, dtype=dtype, layout=layout)
            return self.to_gpu(cpu_array)
        else:
            return gpuarray.empty(self.queue, shape, dtype)

    def to_gpu(self, array):
        return gpuarray.to_device(self.queue, array)

    def upload(self, gpuarray, numpy_array):
        gpuarray.set(numpy_array, self.queue)

    def download(self, gpuarray, numpy_array):
        gpuarray.get(self.queue, numpy_array)

    def randn(self, shape, dtype=np.float64):
        cpu_array = np.random.randn(*shape).astype(dtype)
        return self.from_numpy(cpu_array)

    from_numpy = to_gpu
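A hedged round-trip through the handler (assumes pyopencl is installed and a global queue exists, e.g. via `import pystencils.opencl.autoinit`):

import numpy as np

handler = PyOpenClArrayHandler(queue=None)  # None falls back to the global queue
gpu_arr = handler.zeros((16, 16))           # allocate on host, copy to device
host = np.empty((16, 16))
handler.download(gpu_arr, host)             # device -> host copy
assert np.all(host == 0.0)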
@@ -7,7 +7,6 @@ import numpy as np
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
 from pystencils.datahandling.pycuda import PyCudaArrayHandler, PyCudaNotAvailableHandler
-from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
 from pystencils.enums import Target
 from pystencils.field import (
     Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple,
@@ -24,8 +23,6 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 opencl_queue=None,
-                 opencl_ctx=None,
                  array_handler=None) -> None:
         """
         Creates a data handling for single node simulations.
@@ -48,17 +45,12 @@ class SerialDataHandling(DataHandling):
         self.custom_data_cpu = DotDict()
         self.custom_data_gpu = DotDict()
         self._custom_data_transfer_functions = {}
-        self._opencl_queue = opencl_queue
-        self._opencl_ctx = opencl_ctx

         if not array_handler:
             try:
                 self.array_handler = PyCudaArrayHandler()
             except Exception:
                 self.array_handler = PyCudaNotAvailableHandler()
-
-            if default_target == Target.OPENCL or opencl_queue:
-                self.array_handler = PyOpenClArrayHandler(opencl_queue)
         else:
             self.array_handler = array_handler
@@ -280,8 +272,6 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
         if target is None:
             target = self.default_target
-        if target == Target.OPENCL:  # TODO potential misuse between Target and Backend
-            target = Target.GPU
         assert target in (Target.CPU, Target.GPU)
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]
@@ -324,16 +314,13 @@ class SerialDataHandling(DataHandling):
             else:
                 if functor is None:
                     from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as functor
-                    target = Target.GPU if not isinstance(self.array_handler,
-                                                          PyOpenClArrayHandler) else Target.OPENCL
+                    target = Target.GPU

                 result.append(functor(filtered_stencil, self._domainSize,
                                       index_dimensions=self.fields[name].index_dimensions,
                                       index_dim_shape=values_per_cell,
                                       dtype=self.fields[name].dtype.numpy_dtype,
                                       ghost_layers=gls,
-                                      target=target,
-                                      opencl_queue=self._opencl_queue,
-                                      opencl_ctx=self._opencl_ctx))
+                                      target=target))

         if target == Target.CPU:
             def result_functor():
...
@@ -46,7 +46,7 @@ def get_code_obj(ast: Union[KernelFunction, KernelWrapper], custom_backend=None)
     if isinstance(ast, KernelWrapper):
         ast = ast.ast

-    if ast.backend not in {Backend.C, Backend.CUDA, Backend.OPENCL}:
+    if ast.backend not in {Backend.C, Backend.CUDA}:
         raise NotImplementedError(f'get_code_obj is not implemented for backend {ast.backend}')
     dialect = ast.backend
...
@@ -13,10 +13,6 @@ class Target(Enum):
     """
     Target GPU architecture.
     """
-    OPENCL = auto()
-    """
-    Target all architectures OpenCL covers (Thus both, Target and Backend)
-    """

 class Backend(Enum):
@@ -32,7 +28,3 @@ class Backend(Enum):
     """
     Use the CUDA backend to generate code for NVIDIA GPUs.
     """
-    OPENCL = auto()
-    """
-    Use the OpenCL backend to generate code for OpenCL.
-    """
@@ -2,7 +2,6 @@ import numpy as np
 from itertools import product

 import pystencils.gpucuda
-import pystencils.opencl
 from pystencils import Assignment, Field
 from pystencils.gpucuda.kernelcreation import create_cuda_kernel
 from pystencils.enums import Target
@@ -32,19 +31,14 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
 def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=float, target=Target.GPU, opencl_queue=None, opencl_ctx=None):
-    assert target in {Target.GPU, Target.OPENCL}
+                                  thickness=None, dtype=float, target=Target.GPU):
+    assert target in {Target.GPU}
     src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
     kernels = []

     for src_slice, dst_slice in src_dst_slice_tuples:
         ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
-        if target == pystencils.Target.GPU:
-            kernels.append(pystencils.gpucuda.make_python_function(ast))
-        else:
-            ast._target = pystencils.Target.OPENCL
-            ast._backend = pystencils.Backend.OPENCL
-            kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx))
+        kernels.append(pystencils.gpucuda.make_python_function(ast))

     def functor(pdfs, **_):
         for kernel in kernels:
...
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
from typing import Union

import numpy as np

try:
    import pycuda.driver as cuda
    from pycuda import gpuarray
    import pycuda
except Exception:
    pass


def ndarray_to_tex(tex_ref,  # type: Union[cuda.TextureReference, cuda.SurfaceReference]
                   ndarray,
                   address_mode=None,
                   filter_mode=None,
                   use_normalized_coordinates=False,
                   read_as_integer=False):
    if isinstance(address_mode, str):
        address_mode = getattr(pycuda.driver.address_mode, address_mode.upper())
    if address_mode is None:
        address_mode = cuda.address_mode.BORDER
    if filter_mode is None:
        filter_mode = cuda.filter_mode.LINEAR

    if isinstance(ndarray, np.ndarray):
        cu_array = cuda.np_to_array(ndarray, 'C')
    elif isinstance(ndarray, gpuarray.GPUArray):
        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
    else:
        raise TypeError(
            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')

    tex_ref.set_array(cu_array)
    tex_ref.set_address_mode(0, address_mode)
    if ndarray.ndim >= 2:
        tex_ref.set_address_mode(1, address_mode)
    if ndarray.ndim >= 3:
        tex_ref.set_address_mode(2, address_mode)
    tex_ref.set_filter_mode(filter_mode)

    if not use_normalized_coordinates:
        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES)

    if not read_as_integer:
        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER)
-import functools
 import itertools
 import warnings
 from dataclasses import dataclass, field
@@ -105,14 +104,6 @@ class CreateKernelConfig:
     """
     If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
     """
-    opencl_queue: Any = None
-    """
-    OpenCL queue if OpenCL target is used.
-    """
-    opencl_ctx: Any = None
-    """
-    OpenCL context if OpenCL target is used.
-    """
     index_fields: List[Field] = None
     """
     List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
@@ -136,8 +127,6 @@ class CreateKernelConfig:
             self.backend = Backend.C
         elif self.target == Target.GPU:
             self.backend = Backend.CUDA
-        elif self.target == Target.OPENCL:
-            self.backend = Backend.OPENCL
         else:
             raise NotImplementedError(f'Target {self.target} has no default backend')
@@ -274,20 +263,14 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
                 raise ValueError("Blocking cannot be combined with cacheline-zeroing")
             else:
                 raise ValueError("Invalid value for cpu_vectorize_info")
-    elif config.target == Target.GPU or config.target == Target.OPENCL:
-        if config.backend == Backend.CUDA or config.backend == Backend.OPENCL:
+    elif config.target == Target.GPU:
+        if config.backend == Backend.CUDA:
             from pystencils.gpucuda import create_cuda_kernel
             ast = create_cuda_kernel(assignments, function_name=config.function_name, type_info=config.data_type,
                                      indexing_creator=indexing_creator_from_params(config.gpu_indexing,
                                                                                    config.gpu_indexing_params),
                                      iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
                                      skip_independence_check=config.skip_independence_check)
-            if config.backend == Backend.OPENCL:
-                from pystencils.opencl.opencljit import make_python_function
-                ast._backend = config.backend
-                ast.compile = functools.partial(make_python_function, ast, config.opencl_queue, config.opencl_ctx)
-            ast._target = config.target
-            ast._backend = config.backend

     if not ast:
         raise NotImplementedError(
@@ -349,8 +332,8 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
                                  coordinate_names=config.coordinate_names)
         if config.cpu_openmp:
             add_openmp(ast, num_threads=config.cpu_openmp)
-    elif config.target == Target.GPU or config.target == Target.OPENCL:
-        if config.backend == Backend.CUDA or config.backend == Backend.OPENCL:
+    elif config.target == Target.GPU:
+        if config.backend == Backend.CUDA:
             from pystencils.gpucuda import created_indexed_cuda_kernel
             idx_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
             ast = created_indexed_cuda_kernel(assignments,
@@ -358,12 +341,6 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
                                               type_info=config.data_type,
                                               coordinate_names=config.coordinate_names,
                                               indexing_creator=idx_creator)
-            if config.backend == Backend.OPENCL:
-                from pystencils.opencl.opencljit import make_python_function
-                ast._backend = config.backend
-                ast.compile = functools.partial(make_python_function, ast, config.opencl_queue, config.opencl_ctx)
-            ast._target = config.target
-            ast._backend = config.backend

     if not ast:
         raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}')
...
"""
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
"""
Automatically initializes OpenCL context using any device.
Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
try:
init_globally()
except Exception as e:
import warnings
warnings.warn(str(e))
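For explicit device control instead of autoinit, the context and queue can be supplied by hand; a sketch using standard pyopencl calls:

import pyopencl as cl
from pystencils.opencl.opencljit import init_globally_with_context

ctx = cl.create_some_context()  # or construct a Context for a specific device
queue = cl.CommandQueue(ctx)
init_globally_with_context(ctx, queue)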
import numpy as np

from pystencils.backends.cbackend import get_headers
from pystencils.backends.opencl_backend import generate_opencl
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper

USE_FAST_MATH = True

_global_cl_ctx = None
_global_cl_queue = None


def get_global_cl_queue():
    return _global_cl_queue


def get_global_cl_ctx():
    return _global_cl_ctx


def init_globally(device_index=0):
    import pyopencl as cl
    global _global_cl_ctx
    global _global_cl_queue
    _global_cl_ctx = cl.create_some_context(device_index)
    _global_cl_queue = cl.CommandQueue(_global_cl_ctx)


def init_globally_with_context(opencl_ctx, opencl_queue):
    global _global_cl_ctx
    global _global_cl_queue
    _global_cl_ctx = opencl_ctx
    _global_cl_queue = opencl_queue


def clear_global_ctx():
    global _global_cl_ctx
    global _global_cl_queue
    _global_cl_ctx = None
    _global_cl_queue = None


def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
    """
    Creates an **OpenCL** kernel function from an abstract syntax tree which
    was created for ``target='Target.GPU'``, e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`.

    Args:
        opencl_queue: a valid :class:`pyopencl.CommandQueue`
        opencl_ctx: a valid :class:`pyopencl.Context`
        kernel_function_node: the abstract syntax tree
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.

    Returns:
        compiled kernel as Python function
    """
    import pyopencl as cl

    if not opencl_ctx:
        opencl_ctx = _global_cl_ctx
    if not opencl_queue:
        opencl_queue = _global_cl_queue

    assert opencl_ctx, "No valid OpenCL context!\n" \
                       "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
    assert opencl_queue, "No valid OpenCL queue!\n" \
                         "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"

    if argument_dict is None:
        argument_dict = {}

    # check if double precision is supported and required
    if any([d.double_fp_config == 0 for d in opencl_ctx.devices]):
        for param in kernel_function_node.get_parameters():
            if param.symbol.dtype.base_type:
                if param.symbol.dtype.base_type.numpy_dtype == np.float64:
                    raise ValueError('OpenCL device does not support double precision')
            else:
                if param.symbol.dtype.numpy_dtype == np.float64:
                    raise ValueError('OpenCL device does not support double precision')

    # Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
    kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
    header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __kernel\n"
    code += "#define RESTRICT restrict\n\n"
    code += str(generate_opencl(kernel_function_node, custom_backend=custom_backend))
    options = []
    if USE_FAST_MATH:
        options.append("-cl-unsafe-math-optimizations")
        options.append("-cl-mad-enable")
        options.append("-cl-fast-relaxed-math")
        options.append("-cl-finite-math-only")
    options.append("-I")
    options.append(get_pystencils_include_path())
    mod = cl.Program(opencl_ctx, code).build(options=options)
    func = getattr(mod, kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()

    cache = {}
    cache_values = []
    def wrapper(**kwargs):
        key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
                         for k, v in kwargs.items()))
        try:
            args, block_and_thread_numbers = cache[key]
            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
        except KeyError:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
            assert not any(isinstance(a, np.ndarray)
                           for a in full_arguments.values()), 'Calling an OpenCL kernel with a numpy array!'
            assert not any('pycuda' in str(type(a))
                           for a in full_arguments.values()), 'Calling an OpenCL kernel with a PyCUDA array!'
            shape = _check_arguments(parameters, full_arguments)
            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
                                                                                  block_and_thread_numbers['grid']))
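            # pyopencl launches take the *global* work size rather than CUDA's
            # grid of blocks, hence 'grid' was rescaled above to block * grid.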
            args = _build_numpy_argument_list(parameters, full_arguments)
            args = [a.data if hasattr(a, 'data') else a for a in args]
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)

    wrapper.ast = kernel_function_node
    wrapper.parameters = kernel_function_node.get_parameters()
    wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
    return wrapper
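Putting the removed pieces together, user code drove this JIT roughly as follows (reconstructed from this diff; assumes pyopencl and an OpenCL device; field names are made up):

import pystencils as ps
import pystencils.opencl.autoinit  # creates the global context and queue

src, dst = ps.fields("src, dst: float32[2D]")  # hypothetical fields
config = ps.CreateKernelConfig(target=ps.Target.OPENCL)
ast = ps.create_kernel(ps.Assignment(dst.center, 2 * src.center), config=config)
kernel = ast.compile()  # the functools.partial around make_python_function seen above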
@@ -5,7 +5,6 @@ import sympy as sp
 from pystencils.data_types import TypedSymbol, cast_func
 from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import CustomCodeNode
-from pystencils.enums import Backend
 from pystencils.sympyextensions import fast_subs
@@ -54,8 +53,7 @@ class RNGBase(CustomCodeNode):
         else:
             code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \
                     f"{r.name};\n"
-        args = [print_arg(a) for a in self.args] + \
-               [('&' if dialect == Backend.OPENCL else '') + r.name for r in self.result_symbols]
+        args = [print_arg(a) for a in self.args] + ['' + r.name for r in self.result_symbols]
         code += (self._name + "(" + ", ".join(args) + ");\n")
         return code
...
@@ -10,9 +10,6 @@ def test_create_kernel_config():
     c = ps.CreateKernelConfig(target=ps.Target.GPU)
     assert c.backend == ps.Backend.CUDA

-    c = ps.CreateKernelConfig(target=ps.Target.OPENCL)
-    assert c.backend == ps.Backend.OPENCL
-
     c = ps.CreateKernelConfig(backend=ps.Backend.CUDA)
     assert c.target == ps.Target.CPU
     assert c.backend == ps.Backend.CUDA
...