
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (12)
Showing 202 additions and 98 deletions
@@ -84,21 +84,21 @@ latest-python:
 # Minimal tests in windows environment
-minimal-windows:
-  stage: test
-  except:
-    variables:
-      - $ENABLE_NIGHTLY_BUILDS
-  tags:
-    - win
-  script:
-    - export NUM_CORES=$(nproc --all)
-    - source /cygdrive/c/Users/build/Miniconda3/Scripts/activate
-    - source activate pystencils
-    - pip install joblib
-    - pip list
-    - python -c "import numpy"
-    - py.test -v -m "not (notebook or longrun)"
+#minimal-windows:
+#  stage: test
+#  except:
+#    variables:
+#      - $ENABLE_NIGHTLY_BUILDS
+#  tags:
+#    - win
+#  script:
+#    - export NUM_CORES=$(nproc --all)
+#    - source /cygdrive/c/Users/build/Miniconda3/Scripts/activate
+#    - source activate pystencils
+#    - pip install joblib
+#    - pip list
+#    - python -c "import numpy"
+#    - py.test -v -m "not (notebook or longrun)"
 
 ubuntu:
   stage: test
 ...
@@ -40,12 +40,3 @@ from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
-
-# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
-try:
-    import cupy
-    if cupy.cuda.runtime.getDeviceCount() > 0:
-        GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
-                            key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
-except ImportError:
-    pass
@@ -18,6 +18,7 @@ try:
     import waLBerla as wlb
     if wlb.cpp_available:
         from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
     else:
         ParallelDataHandling = None
 except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
 
         if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = GPUArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
         else:
             array_handler = self.data_handling.array_handler
 ...
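
In the parallel (waLBerla) case the array handler above is now pinned to whatever GPU is current when the boundary handling is constructed. A minimal sketch of what that constructor argument evaluates to (assumes cupy is installed and at least one device is visible):

import cupy

# getDevice() returns the id of the currently active CUDA/HIP device (0 unless
# another device was selected beforehand); this id is what the hunk above
# passes to GPUArrayHandler(...).
current_device_id = cupy.cuda.runtime.getDevice()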
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
     """Creates a data handling instance.
 
     Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target: `Target`
         parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
     """
     if isinstance(default_target, str):
         new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 
 __all__ = ['create_data_handling']
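
A brief usage sketch of the new keyword; the domain size, array name, and device id are illustrative and not taken from the diff:

import pystencils as ps

# Serial data handling that allocates all GPU arrays on device 1. Leaving
# device_number=None would fall back to the device with the most memory.
dh = ps.create_data_handling(domain_size=(64, 64), default_target=ps.Target.GPU,
                             parallel=False, device_number=1)
dh.add_array('f')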
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.
 
@@ -30,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                             default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@ class SerialDataHandling(DataHandling):
         if not array_handler:
             try:
-                self.array_handler = GPUArrayHandler()
-            except Exception:
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
                 self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler
 ...
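
The default device selection above can be read in isolation. A small sketch of the heuristic (assumes cupy and at least one visible device):

import cupy

# mem_info is (free_bytes, total_bytes); sorting by total memory in descending
# order and taking the first entry picks the device with the most memory.
# Python's sort is stable, so ties resolve to the lower device id, matching the
# docstring above.
device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
                       key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]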
@@ -6,30 +6,28 @@ except ImportError:
     cpx = None
 
 import numpy as np
-import pystencils
 
 
 class GPUArrayHandler:
-    @staticmethod
-    def zeros(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def __init__(self, device_number):
+        self._device_number = device_number
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.zeros(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def ones(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def ones(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.ones(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def empty(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def empty(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.empty(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def to_gpu(numpy_array):
+    def to_gpu(self, numpy_array):
         swaps = _get_index_swaps(numpy_array)
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 gpu_array = cp.asarray(numpy_array.base)
             for a, b in reversed(swaps):
                 gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
         else:
             return cp.asarray(numpy_array)
 
-    @staticmethod
-    def upload(array, numpy_array):
+    def upload(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.base.set(numpy_array.base)
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.set(numpy_array)
 
-    @staticmethod
-    def download(array, numpy_array):
+    def download(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array.base[:] = array.base.get()
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array[:] = array.get()
 
-    @staticmethod
-    def randn(shape, dtype=np.float64):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def randn(self, shape, dtype=np.float64):
+        with cp.cuda.Device(self._device_number):
             return cp.random.randn(*shape, dtype=dtype)
 
     @staticmethod
 ...
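
A short usage sketch of the per-instance handler; the import path is assumed for illustration and may differ in the repository:

import numpy as np
from pystencils.gpu.gpu_array_handler import GPUArrayHandler  # path assumed

handler = GPUArrayHandler(0)        # every allocation and transfer is pinned to device 0
gpu_arr = handler.zeros((16, 16))
cpu_arr = np.ones((16, 16))
handler.upload(gpu_arr, cpu_arr)    # asserts that gpu_arr lives on device 0
handler.download(gpu_arr, cpu_arr)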
 import numpy as np
-import pystencils
 
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
 from pystencils.typing import StructType
@@ -42,6 +41,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     headers = get_headers(kernel_function_node)
     if cp.cuda.runtime.is_hip:
         headers.add('"gpu_defines.h"')
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<hip/hip_fp16.h>')
     else:
         headers.update({'"gpu_defines.h"', '<cstdint>'})
         for field in kernel_function_node.fields_accessed:
@@ -72,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                                    for k, v in kwargs.items()))
         try:
             args, block_and_thread_numbers = cache[key]
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
         except KeyError:
             full_arguments = argument_dict.copy()
@@ -87,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             args = tuple(_build_numpy_argument_list(parameters, full_arguments))
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
 
         # useful for debugging:
-        # with cp.cuda.Device(pystencils.GPU_DEVICE):
-        #     cp.cuda.runtime.deviceSynchronize()
+        # cp.cuda.runtime.deviceSynchronize()
         # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
     ast = kernel_function_node
 ...
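
The kernel wrapper now infers the target device from its array arguments instead of reading a global. A minimal standalone sketch of that check (the argument tuple is illustrative):

import cupy as cp

args = (cp.zeros(8), cp.ones(8), 3.14)  # mix of cupy arrays and scalar parameters
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
    pass  # the compiled kernel would be launched here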
 import abc
 from functools import partial
+import math
 
 import sympy as sp
 from sympy.core.cache import cacheit
 
-import pystencils
 from pystencils.astnodes import Block, Conditional
 from pystencils.typing import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
@@ -97,11 +97,14 @@ class BlockIndexing(AbstractIndexing):
         permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                 gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
+        maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
+                            maximum block size from the device properties
+        device_number: device number of the used GPU. By default, the zeroth device is used.
     """
 
     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
-                 maximum_block_size=(1024, 1024, 64)):
+                 maximum_block_size=(1024, 1024, 64), device_number=None):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -110,17 +113,22 @@ class BlockIndexing(AbstractIndexing):
         self._block_size = block_size
         if maximum_block_size == 'auto':
+            assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
             # Get device limits
             import cupy as cp
-            device = cp.cuda.Device(pystencils.GPU_DEVICE)
-            da = device.attributes
-            maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
+            # See https://github.com/cupy/cupy/issues/7676
+            if cp.cuda.runtime.is_hip:
+                maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
+            else:
+                da = cp.cuda.Device(device_number).attributes
+                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
 
         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._device_number = device_number
 
     @property
     def cuda_indices(self):
@@ -178,32 +186,34 @@ class BlockIndexing(AbstractIndexing):
     def iteration_space(self, arr_shape):
         return _iteration_space(self._iterationSlice, arr_shape)
 
-    @staticmethod
-    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
-        """Shrinks the block_size if there are too many registers used per multiprocessor.
+    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
+        """Shrinks the block_size if there are too many registers used per block.
         This is not done automatically, since the required_registers_per_thread are not known before compilation.
         They can be obtained by ``func.num_regs`` from a cupy function.
-        :returns smaller block_size if too many registers are used.
+        Args:
+            block_size: used block size that is target for limiting
+            required_registers_per_thread: needed registers per thread
+        returns: smaller block_size if too many registers are used.
         """
         import cupy as cp
-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
-        da = device.attributes
-        available_registers_per_mp = da.get("MaxRegistersPerMultiprocessor")
-
-        block = block_size
+        # See https://github.com/cupy/cupy/issues/7676
+        if cp.cuda.runtime.is_hip:
+            max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
+        else:
+            device = cp.cuda.Device(self._device_number)
+            da = device.attributes
+            max_registers_per_block = da.get("MaxRegistersPerBlock")
 
+        result = list(block_size)
         while True:
-            num_threads = 1
-            for t in block:
-                num_threads *= t
-            required_registers_per_mt = num_threads * required_registers_per_thread
-            if required_registers_per_mt <= available_registers_per_mp:
-                return block
+            required_registers = math.prod(result) * required_registers_per_thread
+            if required_registers <= max_registers_per_block:
+                return result
             else:
-                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
-                assert block[largest_grid_entry_idx] >= 2
-                block[largest_grid_entry_idx] //= 2
+                largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
+                assert result[largest_list_entry_idx] >= 2
+                result[largest_list_entry_idx] //= 2
 
     @staticmethod
     def permute_block_size_according_to_layout(block_size, layout):
 ...
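
To make the shrinking loop above concrete, a small standalone trace with assumed numbers (65536 registers per block and 100 registers per thread; neither value comes from the diff):

import math

block = [1024, 1024, 1]
while math.prod(block) * 100 > 65536:              # same criterion as required_registers above
    largest = max(range(len(block)), key=lambda e: block[e])
    block[largest] //= 2                           # halve the largest entry, as in the loop above
print(block)  # [16, 32, 1]: 512 threads * 100 registers = 51200 <= 65536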
@@ -3,3 +3,10 @@
 #define POS_INFINITY __int_as_float(0x7f800000)
 #define INFINITY POS_INFINITY
 #define NEG_INFINITY __int_as_float(0xff800000)
+
+#ifdef __HIPCC_RTC__
+typedef __hip_uint8_t uint8_t;
+typedef __hip_int8_t int8_t;
+typedef __hip_uint16_t uint16_t;
+typedef __hip_int16_t int16_t;
+#endif
-#ifndef __OPENCL_VERSION__
+#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
 #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
@@ -38,7 +38,7 @@
 #endif
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
 #define QUALIFIERS static inline
@@ -59,7 +59,9 @@
 typedef uint32_t uint32;
 typedef uint64_t uint64;
 #else
+#ifndef __HIPCC_RTC__
 #include <cstdint>
+#endif
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 #endif
@@ -75,7 +77,7 @@ typedef svfloat64_t svfloat64_st;
 
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     // host code
 #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
     *hip = __mulhwu(a,b);
@@ -186,7 +188,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 #endif
 }
 
-#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
+#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
 #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
 ...
@@ -5,6 +5,8 @@ from typing import Sequence
 import numpy as np
 import sympy as sp
 
+from pystencils.utils import binary_numbers
+
 
 def inverse_direction(direction):
     """Returns inverse i.e. negative of given direction tuple
@@ -293,6 +295,38 @@ def direction_string_to_offset(direction: str, dim: int = 3):
     return offset[:dim]
 
 
+def adjacent_directions(direction):
+    """
+    Returns all adjacent directions for a direction as tuple of tuples. This is useful for example to find all
+    directions relevant for neighbour communication.
+
+    Args:
+        direction: tuple representing a direction. For example (0, 1, 0) for the northern side
+
+    Examples:
+        >>> adjacent_directions((0, 0, 0))
+        ((0, 0, 0),)
+        >>> adjacent_directions((0, 1, 0))
+        ((0, 1, 0),)
+        >>> adjacent_directions((0, 1, 1))
+        ((0, 0, 1), (0, 1, 0), (0, 1, 1))
+        >>> adjacent_directions((-1, -1))
+        ((-1, -1), (-1, 0), (0, -1))
+    """
+    result = set()
+
+    if all(e == 0 for e in direction):
+        result.add(direction)
+        return tuple(result)
+
+    binary_numbers_list = binary_numbers(len(direction))
+
+    for adjacent_direction in binary_numbers_list:
+        for i, entry in enumerate(direction):
+            if entry == 0:
+                adjacent_direction[i] = 0
+            if entry == -1 and adjacent_direction[i] == 1:
+                adjacent_direction[i] = -1
+
+        if not all(e == 0 for e in adjacent_direction):
+            result.add(tuple(adjacent_direction))
+
+    return tuple(sorted(result))
+
+
 # -------------------------------------- Visualization -----------------------------------------------------------------
 ...
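
One more illustrative call of the new helper, beyond the doctests above; the direction tuple is arbitrary and chosen to show a mixed corner case:

from pystencils.stencil import adjacent_directions

# For the (-1, 1) corner, communication also has to cover the two faces that
# the corner touches:
print(adjacent_directions((-1, 1)))  # ((-1, 0), (-1, 1), (0, 1))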
@@ -96,6 +96,21 @@ def boolean_array_bounding_box(boolean_array):
     return bounds
 
 
+def binary_numbers(n):
+    """Returns all binary numbers up to 2^n - 1
+
+    Example:
+        >>> binary_numbers(2)
+        [[0, 0], [0, 1], [1, 0], [1, 1]]
+    """
+    result = list()
+    for i in range(1 << n):
+        binary_number = bin(i)[2:]
+        binary_number = '0' * (n - len(binary_number)) + binary_number
+
+        result.append((list(map(int, binary_number))))
+    return result
+
+
 class LinearEquationSystem:
     """Symbolic linear system of equations - consisting of matrix and right hand side.
 ...
@@ -47,5 +47,5 @@ def test_custom_backends_gpu():
 
     ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
     pystencils.show_code(ast, ScreamingGpuBackend())
-    with pytest.raises(cupy.cuda.compiler.JitifyException):
+    with pytest.raises((cupy.cuda.compiler.JitifyException, cupy.cuda.compiler.CompileException)):
         pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
@@ -15,6 +15,12 @@ except ImportError:
     import unittest.mock
     pytest = unittest.mock.MagicMock()
 
+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
@@ -365,10 +371,11 @@ def test_load_data():
     assert np.all(dh.cpu_arrays['dst2']) == 0
 
 
-def test_array_handler():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
     size = (2, 2)
     pytest.importorskip('cupy')
-    array_handler = GPUArrayHandler()
+    array_handler = GPUArrayHandler(device_number)
 
     zero_array = array_handler.zeros(size)
 
     cpu_array = np.empty(size)
 ...
+import pytest
+
 import numpy as np
 import cupy as cp
 import sympy as sp
@@ -8,6 +10,12 @@ from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
 
+try:
+    import cupy
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 
 def test_averaging_kernel():
     size = (40, 55)
@@ -153,7 +161,8 @@ def test_periodicity():
     np.testing.assert_equal(cpu_result, gpu_result)
 
 
-def test_block_indexing():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_block_indexing(device_number):
     f = fields("f: [3D]")
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -162,8 +171,19 @@ def test_block_indexing():
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
 
-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
+                       maximum_block_size="auto", device_number=device_number)
 
     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], 1000)
+    registers_per_thread = 1000
+    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+
+    if cp.cuda.runtime.is_hip:
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
+    else:
+        device = cp.cuda.Device(device_number)
+        da = device.attributes
+        max_registers_per_block = da.get("MaxRegistersPerBlock")
+
+    assert np.prod(blocks) * registers_per_thread < max_registers_per_block
 
     assert sum(blocks) < sum([1024, 1024, 1])
@@ -29,10 +29,6 @@ def offsets_in_plane(normal_plane, offset_int, dimension):
     return result
 
 
-# TODO this fails because the condition of the Conditional is not simplified anymore:
-# TODO: ---> transformation.simplify_conditionals
-# TODO this should be fixed
-@pytest.mark.xfail
 def test_staggered_iteration():
     dim = 2
     f_arr = np.arange(5**dim).reshape([5]*dim).astype(np.float64)
 ...