Commit 32de591c authored by Markus Holzer
Merge branch 'device_selection' into 'master'

Remove pystencils.GPU_DEVICE

See merge request !336
parents 8e92a559 376ee8d3
Pipeline #54331 passed with stages in 24 minutes and 32 seconds
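
As a rough migration sketch (hedged: the `device_number` keyword is the one introduced by the diff below, everything else is standard pystencils API), code that previously relied on the removed global now selects the GPU explicitly, or leaves it to pystencils:

    import pystencils as ps

    # before this MR: a module-level global chosen at import time
    #     device = ps.GPU_DEVICE                      # removed by this merge request

    # after this MR: pass the device explicitly, or leave it as None to get
    # the GPU with the most memory (see the docstrings further down)
    dh = ps.create_data_handling(domain_size=(64, 64),
                                 default_target=ps.Target.GPU,
                                 device_number=0)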
@@ -40,12 +40,3 @@ from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
-
-# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
-try:
-    import cupy
-    if cupy.cuda.runtime.getDeviceCount() > 0:
-        GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
-                            key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
-except ImportError:
-    pass
@@ -18,6 +18,7 @@ try:
     import waLBerla as wlb
     if wlb.cpp_available:
         from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
     else:
         ParallelDataHandling = None
 except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")

         if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = GPUArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
         else:
             array_handler = self.data_handling.array_handler
...
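
Side note on the replacement line above: `cupy.cuda.runtime.getDevice()` returns the id of the currently active device, so the parallel boundary handling now follows whatever cupy device context is active when it is constructed. A standalone illustration (the `with` block assumes a second GPU exists):

    import cupy as cp

    print(cp.cuda.runtime.getDevice())      # -> 0, the default current device
    with cp.cuda.Device(1):                 # activate device 1 (needs >= 2 GPUs)
        print(cp.cuda.runtime.getDevice())  # -> 1 inside the context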
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
     """Creates a data handling instance.

     Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target: `Target`
         parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
     """
     if isinstance(default_target, str):
         new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)


 __all__ = ['create_data_handling']
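
The default selection rule documented above (largest total memory, lowest index on ties) can be written standalone; a sketch assuming cupy is installed — `Device.mem_info` is `(free_bytes, total_bytes)`, so index 1 is the total memory:

    import cupy

    def default_device_number():
        count = cupy.cuda.runtime.getDeviceCount()
        if count == 0:
            return None
        # sorted() is stable, so among devices with equal total memory the lowest index wins
        return sorted(range(count),
                      key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]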
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.
@@ -30,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                             default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@ class SerialDataHandling(DataHandling):
         if not array_handler:
             try:
-                self.array_handler = GPUArrayHandler()
-            except Exception:
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
                 self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler
...
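
A hedged usage sketch of the keywords documented above, constructing the serial data handling directly (assumes cupy and a GPU at index 0):

    from pystencils import Target
    from pystencils.datahandling.serial_datahandling import SerialDataHandling

    dh = SerialDataHandling((32, 32),
                            periodicity=(True, False),   # periodic along the first axis only
                            default_target=Target.GPU,
                            device_number=0)             # None -> largest-memory device
    f = dh.add_array("f")                                # the GPU copy lives on device 0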
@@ -6,30 +6,28 @@ except ImportError:
     cpx = None

 import numpy as np
-import pystencils


 class GPUArrayHandler:
-    @staticmethod
-    def zeros(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def __init__(self, device_number):
+        self._device_number = device_number
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.zeros(shape=shape, dtype=dtype, order=order)

-    @staticmethod
-    def ones(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def ones(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.ones(shape=shape, dtype=dtype, order=order)

-    @staticmethod
-    def empty(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def empty(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.empty(shape=shape, dtype=dtype, order=order)

-    @staticmethod
-    def to_gpu(numpy_array):
+    def to_gpu(self, numpy_array):
         swaps = _get_index_swaps(numpy_array)
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 gpu_array = cp.asarray(numpy_array.base)
             for a, b in reversed(swaps):
                 gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
         else:
             return cp.asarray(numpy_array)

-    @staticmethod
-    def upload(array, numpy_array):
+    def upload(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.base.set(numpy_array.base)
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.set(numpy_array)

-    @staticmethod
-    def download(array, numpy_array):
+    def download(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array.base[:] = array.base.get()
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array[:] = array.get()

-    @staticmethod
-    def randn(shape, dtype=np.float64):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def randn(self, shape, dtype=np.float64):
+        with cp.cuda.Device(self._device_number):
             return cp.random.randn(*shape, dtype=dtype)

     @staticmethod
...
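
A hedged round-trip through the refactored handler; the module path `pystencils.gpu.gpu_array_handler` and a GPU at index 0 are assumptions:

    import numpy as np
    from pystencils.gpu.gpu_array_handler import GPUArrayHandler  # assumed import path

    handler = GPUArrayHandler(0)      # the device is now fixed per handler instance
    cpu_in = np.random.rand(4, 4)
    gpu = handler.to_gpu(cpu_in)      # allocated on device 0
    cpu_out = np.empty_like(cpu_in)
    handler.download(gpu, cpu_out)    # asserts the array lives on the handler's device
    assert np.allclose(cpu_in, cpu_out)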
 import numpy as np

-import pystencils
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
 from pystencils.typing import StructType
@@ -75,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
                                 for k, v in kwargs.items()))
         try:
             args, block_and_thread_numbers = cache[key]
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
         except KeyError:
             full_arguments = argument_dict.copy()
@@ -90,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
             args = tuple(_build_numpy_argument_list(parameters, full_arguments))
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)

             # useful for debugging:
-            # with cp.cuda.Device(pystencils.GPU_DEVICE):
-            #     cp.cuda.runtime.deviceSynchronize()
+            # cp.cuda.runtime.deviceSynchronize()
             # cuda.Context.synchronize()  # useful for debugging, to get errors right after kernel was called

     ast = kernel_function_node
...
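
The device-inference idiom added to the kernel wrapper above can be shown in isolation (assumes cupy and at least one GPU; non-cupy arguments such as scalars are simply ignored):

    import cupy as cp

    def infer_device(args):
        # every cupy array carries the id of the device it was allocated on
        devices = set(a.device.id for a in args if type(a) is cp.ndarray)
        assert len(devices) == 1, "All arrays used by a kernel need to be allocated on the same device"
        return devices.pop()

    a, b = cp.zeros(8), cp.ones(8)
    print(infer_device([a, b, 3.14]))   # -> 0 when both arrays live on the default device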
@@ -2,7 +2,6 @@ from typing import Union

 import numpy as np

-import pystencils
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.config import CreateKernelConfig
 from pystencils.typing import StructType, TypedSymbol
@@ -11,7 +10,7 @@ from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
 from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
-from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
+from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.transformations import (
     get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -22,8 +21,6 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                        config: CreateKernelConfig):
     function_name = config.function_name
-    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
-        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
     iteration_slice = config.iteration_slice
     ghost_layers = config.ghost_layers
@@ -123,8 +120,6 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
     index_fields = config.index_fields
     function_name = config.function_name
     coordinate_names = config.coordinate_names
-    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
-        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
     fields_written = assignments.bound_fields
...
@@ -15,6 +15,12 @@ except ImportError:
     import unittest.mock
     pytest = unittest.mock.MagicMock()

+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
@@ -365,10 +371,11 @@ def test_load_data():
     assert np.all(dh.cpu_arrays['dst2']) == 0


-def test_array_handler():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
     size = (2, 2)
     pytest.importorskip('cupy')
-    array_handler = GPUArrayHandler()
+    array_handler = GPUArrayHandler(device_number)

     zero_array = array_handler.zeros(size)

     cpu_array = np.empty(size)
...
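
The parametrization pattern above runs every GPU test once per visible device and leaves it skipped when cupy is missing, because an empty parameter list makes pytest skip the parametrized test. A minimal standalone version of the same pattern:

    import pytest

    try:
        import cupy.cuda.runtime
        device_numbers = range(cupy.cuda.runtime.getDeviceCount())
    except ImportError:
        device_numbers = []

    @pytest.mark.parametrize("device_number", device_numbers)
    def test_runs_once_per_gpu(device_number):
        assert device_number >= 0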
+import pytest
 import numpy as np
 import cupy as cp
 import sympy as sp
 from scipy.ndimage import convolve

-import pystencils
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers

+try:
+    import cupy
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+

 def test_averaging_kernel():
     size = (40, 55)
@@ -154,7 +161,8 @@ def test_periodicity():
     np.testing.assert_equal(cpu_result, gpu_result)


-def test_block_indexing():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_block_indexing(device_number):
     f = fields("f: [3D]")
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -164,16 +172,16 @@ def test_block_indexing():
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)

     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
-                       maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
+                       maximum_block_size="auto", device_number=device_number)

     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
     registers_per_thread = 1000
     blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)

     if cp.cuda.runtime.is_hip:
-        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
     else:
-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
+        device = cp.cuda.Device(device_number)
         da = device.attributes
         max_registers_per_block = da.get("MaxRegistersPerBlock")
...