Commit 32de591c authored by Markus Holzer

Merge branch 'device_selection' into 'master'

Remove pystencils.GPU_DEVICE

See merge request pycodegen/pystencils!336
parents 8e92a559 376ee8d3
Pipeline #54334 failed
@@ -40,12 +40,3 @@ from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
try:
import cupy
if cupy.cuda.runtime.getDeviceCount() > 0:
GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
except ImportError:
pass
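The auto-selection that used to run at import time is gone; code that still wants the "largest total memory" device can reproduce it locally. A minimal sketch, assuming cupy and at least one visible GPU (cupy.cuda.Device(i).mem_info is a (free_bytes, total_bytes) tuple):

    import cupy

    # Pick the device with the largest total memory; max() keeps the lower
    # device number on ties, matching the behaviour described in the docstrings below.
    device_number = max(range(cupy.cuda.runtime.getDeviceCount()),
                        key=lambda i: cupy.cuda.Device(i).mem_info[1])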
@@ -18,6 +18,7 @@ try:
import waLBerla as wlb
if wlb.cpp_available:
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
import cupy.cuda.runtime
else:
ParallelDataHandling = None
except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
array_handler = GPUArrayHandler()
array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
else:
array_handler = self.data_handling.array_handler
......
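With the module-level device gone, the parallel branch simply binds its GPUArrayHandler to whatever device is current when the BoundaryHandling is constructed. A small sketch of that runtime call, assuming cupy is installed:

    import cupy

    # cupy.cuda.runtime.getDevice() reports the id of the currently active device,
    # so activating a device context before construction selects the GPU used here.
    with cupy.cuda.Device(0):
        assert cupy.cuda.runtime.getDevice() == 0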
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_layout: str = 'SoA',
default_target: Target = Target.CPU,
parallel: bool = False,
default_ghost_layers: int = 1) -> DataHandling:
default_ghost_layers: int = 1,
device_number: Union[int, None] = None) -> DataHandling:
"""Creates a data handling instance.
Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_target: `Target`
parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
specified. If none is given, the device with the largest amount of memory is used. If multiple
devices have the same amount of memory, the one with the lower number is used
"""
if isinstance(default_target, str):
new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
periodicity=periodicity,
default_target=default_target,
default_layout=default_layout,
default_ghost_layers=default_ghost_layers)
default_ghost_layers=default_ghost_layers,
device_number=device_number)
__all__ = ['create_data_handling']
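A usage sketch of the extended factory, assuming cupy and at least one GPU; leaving device_number out falls back to the auto-selection described in the docstring above:

    import pystencils as ps

    # Serial data handling whose GPU arrays are all allocated on device 0.
    dh = ps.create_data_handling(domain_size=(64, 64),
                                 default_target=ps.Target.GPU,
                                 device_number=0)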
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
default_layout: str = 'SoA',
periodicity: Union[bool, Sequence[bool]] = False,
default_target: Target = Target.CPU,
array_handler=None) -> None:
array_handler=None,
device_number=None) -> None:
"""
Creates a data handling for single node simulations.
@@ -30,9 +31,17 @@
domain_size: size of the spatial domain as tuple
default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
default_layout: default layout used, if not overridden in add_array() method
periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
False (non-periodic)
default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
allocated if not overwritten in add_array, and synchronization functions are for the GPU by
default
array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
the device with the largest amount of memory is used. If multiple devices have the same
amount of memory, the one with the lower number is used
"""
super(SerialDataHandling, self).__init__()
self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@
if not array_handler:
try:
self.array_handler = GPUArrayHandler()
except Exception:
if device_number is None:
import cupy.cuda.runtime
if cupy.cuda.runtime.getDeviceCount() > 0:
device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
self.array_handler = GPUArrayHandler(device_number)
except ImportError:
self.array_handler = GPUNotAvailableHandler()
else:
self.array_handler = array_handler
......
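The same parameter is accepted when the class is constructed directly; a sketch, assuming cupy is installed (the module path is assumed by analogy with the parallel_datahandling import above):

    from pystencils import Target
    from pystencils.datahandling.serial_datahandling import SerialDataHandling  # path assumed

    # Every GPU array created through this handling is placed on device 1.
    dh = SerialDataHandling((32, 32), default_target=Target.GPU, device_number=1)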
@@ -6,30 +6,28 @@ except ImportError:
cpx = None
import numpy as np
import pystencils
class GPUArrayHandler:
@staticmethod
def zeros(shape, dtype=np.float64, order='C'):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def __init__(self, device_number):
self._device_number = device_number
def zeros(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.zeros(shape=shape, dtype=dtype, order=order)
@staticmethod
def ones(shape, dtype=np.float64, order='C'):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def ones(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.ones(shape=shape, dtype=dtype, order=order)
@staticmethod
def empty(shape, dtype=np.float64, order='C'):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def empty(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.empty(shape=shape, dtype=dtype, order=order)
@staticmethod
def to_gpu(numpy_array):
def to_gpu(self, numpy_array):
swaps = _get_index_swaps(numpy_array)
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
gpu_array = cp.asarray(numpy_array.base)
for a, b in reversed(swaps):
gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
else:
return cp.asarray(numpy_array)
@staticmethod
def upload(array, numpy_array):
def upload(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
array.base.set(numpy_array.base)
else:
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
array.set(numpy_array)
@staticmethod
def download(array, numpy_array):
def download(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
numpy_array.base[:] = array.base.get()
else:
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
numpy_array[:] = array.get()
@staticmethod
def randn(shape, dtype=np.float64):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def randn(self, shape, dtype=np.float64):
with cp.cuda.Device(self._device_number):
return cp.random.randn(*shape, dtype=dtype)
@staticmethod
......
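Each handler instance is now tied to one device, and every allocation runs inside that device's context. A sketch of the equivalent cupy calls, assuming a CUDA-capable device 0:

    import cupy as cp
    import numpy as np

    # What GPUArrayHandler(0).zeros((4, 4)) now does internally:
    with cp.cuda.Device(0):
        arr = cp.zeros(shape=(4, 4), dtype=np.float64, order='C')
    assert arr.device.id == 0  # the array lives on the handler's device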
import numpy as np
import pystencils
from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda
from pystencils.typing import StructType
@@ -75,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
with cp.cuda.Device(pystencils.GPU_DEVICE):
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError:
full_arguments = argument_dict.copy()
@@ -90,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
args = tuple(_build_numpy_argument_list(parameters, full_arguments))
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
with cp.cuda.Device(pystencils.GPU_DEVICE):
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# with cp.cuda.Device(pystencils.GPU_DEVICE):
# cp.cuda.runtime.deviceSynchronize()
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node
......
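The launch wrapper no longer enters a globally configured device; it infers the device from the kernel's cupy arguments and requires them to agree. A standalone sketch of that inference, assuming cupy:

    import cupy as cp

    args = (cp.zeros(8), cp.ones(8))  # stand-ins for the cached kernel arguments
    device = set(a.device.id for a in args if type(a) is cp.ndarray)
    assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
    with cp.cuda.Device(device.pop()):
        pass  # the compiled kernel is launched inside this device context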
@@ -2,7 +2,6 @@ from typing import Union
import numpy as np
import pystencils
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
@@ -11,7 +10,7 @@ from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.transformations import (
get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -22,8 +21,6 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
config: CreateKernelConfig):
function_name = config.function_name
if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice = config.iteration_slice
ghost_layers = config.ghost_layers
@@ -123,8 +120,6 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
index_fields = config.index_fields
function_name = config.function_name
coordinate_names = config.coordinate_names
if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
fields_written = assignments.bound_fields
......
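Since the implicit default is no longer injected here, a specific device for the GPU indexing machinery has to be requested explicitly when needed. A hedged sketch (option names taken from the surrounding code and the tests below):

    from pystencils import CreateKernelConfig, Target

    # gpu_indexing_params is forwarded to the indexing class (BlockIndexing by default),
    # which accepts a device_number for its block-size queries.
    config = CreateKernelConfig(target=Target.GPU,
                                gpu_indexing_params={'device_number': 0})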
@@ -15,6 +15,12 @@ except ImportError:
import unittest.mock
pytest = unittest.mock.MagicMock()
try:
import cupy.cuda.runtime
device_numbers = range(cupy.cuda.runtime.getDeviceCount())
except ImportError:
device_numbers = []
SCRIPT_FOLDER = Path(__file__).parent.absolute()
INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
@@ -365,10 +371,11 @@ def test_load_data():
assert np.all(dh.cpu_arrays['dst2']) == 0
def test_array_handler():
@pytest.mark.parametrize("device_number", device_numbers)
def test_array_handler(device_number):
size = (2, 2)
pytest.importorskip('cupy')
array_handler = GPUArrayHandler()
array_handler = GPUArrayHandler(device_number)
zero_array = array_handler.zeros(size)
cpu_array = np.empty(size)
......
import pytest
import numpy as np
import cupy as cp
import sympy as sp
from scipy.ndimage import convolve
import pystencils
from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
from pystencils.gpu import BlockIndexing
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
try:
import cupy
device_numbers = range(cupy.cuda.runtime.getDeviceCount())
except ImportError:
device_numbers = []
def test_averaging_kernel():
size = (40, 55)
@@ -154,7 +161,8 @@ def test_periodicity():
np.testing.assert_equal(cpu_result, gpu_result)
def test_block_indexing():
@pytest.mark.parametrize("device_number", device_numbers)
def test_block_indexing(device_number):
f = fields("f: [3D]")
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -164,16 +172,16 @@ def test_block_indexing():
assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
maximum_block_size="auto", device_number=device_number)
# This function should be used if number of needed registers is known. Can be determined with func.num_regs
registers_per_thread = 1000
blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
else:
device = cp.cuda.Device(pystencils.GPU_DEVICE)
device = cp.cuda.Device(device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
......
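The register-limit query in the test above now runs once per real device. A minimal sketch of the non-HIP branch, assuming cupy and a CUDA device 0:

    import cupy as cp

    device = cp.cuda.Device(0)
    # Device.attributes maps CUDA device attribute names to their values.
    max_registers_per_block = device.attributes.get("MaxRegistersPerBlock")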