Commit 32de591c authored by Markus Holzer

Merge branch 'device_selection' into 'master'

Remove pystencils.GPU_DEVICE

See merge request pycodegen/pystencils!336
parents 8e92a559 376ee8d3
Pipeline #54334 failed
@@ -40,12 +40,3 @@ from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
try:
import cupy
if cupy.cuda.runtime.getDeviceCount() > 0:
GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
except ImportError:
pass
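The auto-selection that used to run at import time is gone; code that still wants the "largest total memory" device can reproduce it locally. A minimal sketch, assuming cupy and at least one visible GPU (cupy.cuda.Device(i).mem_info is a (free_bytes, total_bytes) tuple):

    import cupy

    # Pick the device with the largest total memory; max() keeps the lower
    # device number on ties, matching the behaviour described in the docstrings below.
    device_number = max(range(cupy.cuda.runtime.getDeviceCount()),
                        key=lambda i: cupy.cuda.Device(i).mem_info[1])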
@@ -18,6 +18,7 @@ try:
import waLBerla as wlb
if wlb.cpp_available:
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
import cupy.cuda.runtime
else:
ParallelDataHandling = None
except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
array_handler = GPUArrayHandler()
array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
else:
array_handler = self.data_handling.array_handler
......
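With the module-level device gone, the parallel branch simply binds its GPUArrayHandler to whatever device is current when the BoundaryHandling is constructed. A small sketch of that runtime call, assuming cupy is installed:

    import cupy

    # cupy.cuda.runtime.getDevice() reports the id of the currently active device,
    # so activating a device context before construction selects the GPU used here.
    with cupy.cuda.Device(0):
        assert cupy.cuda.runtime.getDevice() == 0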
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_layout: str = 'SoA',
default_target: Target = Target.CPU,
parallel: bool = False,
default_ghost_layers: int = 1) -> DataHandling:
default_ghost_layers: int = 1,
device_number: Union[int, None] = None) -> DataHandling:
"""Creates a data handling instance.
Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_target: `Target`
parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
specified. If none is given, the device with the largest amount of memory is used. If multiple
devices have the same amount of memory, the one with the lower number is used
"""
if isinstance(default_target, str):
new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
periodicity=periodicity,
default_target=default_target,
default_layout=default_layout,
default_ghost_layers=default_ghost_layers)
default_ghost_layers=default_ghost_layers,
device_number=device_number)
__all__ = ['create_data_handling']
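A usage sketch of the extended factory, assuming cupy and at least one GPU; leaving device_number out falls back to the auto-selection described in the docstring above:

    import pystencils as ps

    # Serial data handling whose GPU arrays are all allocated on device 0.
    dh = ps.create_data_handling(domain_size=(64, 64),
                                 default_target=ps.Target.GPU,
                                 device_number=0)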
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
default_layout: str = 'SoA',
periodicity: Union[bool, Sequence[bool]] = False,
default_target: Target = Target.CPU,
array_handler=None) -> None:
array_handler=None,
device_number=None) -> None:
"""
Creates a data handling for single node simulations.
@@ -30,9 +31,17 @@
domain_size: size of the spatial domain as tuple
default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
default_layout: default layout used, if not overridden in add_array() method
periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
False (non-periodic)
default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
allocated if not overwritten in add_array, and synchronization functions are for the GPU by
default
array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
the device with the largest amount of memory is used. If multiple devices have the same
amount of memory, the one with the lower number is used
"""
super(SerialDataHandling, self).__init__()
self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@
if not array_handler:
try:
self.array_handler = GPUArrayHandler()
except Exception:
if device_number is None:
import cupy.cuda.runtime
if cupy.cuda.runtime.getDeviceCount() > 0:
device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
self.array_handler = GPUArrayHandler(device_number)
except ImportError:
self.array_handler = GPUNotAvailableHandler()
else:
self.array_handler = array_handler
......
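The same parameter is accepted when the class is constructed directly; a sketch, assuming cupy is installed (the module path is assumed by analogy with the parallel_datahandling import above):

    from pystencils import Target
    from pystencils.datahandling.serial_datahandling import SerialDataHandling  # path assumed

    # Every GPU array created through this handling is placed on device 1.
    dh = SerialDataHandling((32, 32), default_target=Target.GPU, device_number=1)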
@@ -6,30 +6,28 @@ except ImportError:
cpx = None
import numpy as np
import pystencils
class GPUArrayHandler:
@staticmethod
def zeros(shape, dtype=np.float64, order='C'):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def __init__(self, device_number):
self._device_number = device_number
def zeros(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.zeros(shape=shape, dtype=dtype, order=order)
@staticmethod
def ones(shape, dtype=np.float64, order='C'):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def ones(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.ones(shape=shape, dtype=dtype, order=order)
@staticmethod
def empty(shape, dtype=np.float64, order='C'):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def empty(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.empty(shape=shape, dtype=dtype, order=order)
@staticmethod
def to_gpu(numpy_array):
def to_gpu(self, numpy_array):
swaps = _get_index_swaps(numpy_array)
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
gpu_array = cp.asarray(numpy_array.base)
for a, b in reversed(swaps):
gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
else:
return cp.asarray(numpy_array)
@staticmethod
def upload(array, numpy_array):
def upload(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
array.base.set(numpy_array.base)
else:
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
array.set(numpy_array)
@staticmethod
def download(array, numpy_array):
def download(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
numpy_array.base[:] = array.base.get()
else:
with cp.cuda.Device(pystencils.GPU_DEVICE):
with cp.cuda.Device(self._device_number):
numpy_array[:] = array.get()
@staticmethod
def randn(shape, dtype=np.float64):
with cp.cuda.Device(pystencils.GPU_DEVICE):
def randn(self, shape, dtype=np.float64):
with cp.cuda.Device(self._device_number):
return cp.random.randn(*shape, dtype=dtype)
@staticmethod
......
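Each handler instance is now tied to one device, and every allocation runs inside that device's context. A sketch of the equivalent cupy calls, assuming a CUDA-capable device 0:

    import cupy as cp
    import numpy as np

    # What GPUArrayHandler(0).zeros((4, 4)) now does internally:
    with cp.cuda.Device(0):
        arr = cp.zeros(shape=(4, 4), dtype=np.float64, order='C')
    assert arr.device.id == 0  # the array lives on the handler's device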
import numpy as np
import pystencils
from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda
from pystencils.typing import StructType
@@ -75,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
with cp.cuda.Device(pystencils.GPU_DEVICE):
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError:
full_arguments = argument_dict.copy()
@@ -90,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
args = tuple(_build_numpy_argument_list(parameters, full_arguments))
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
with cp.cuda.Device(pystencils.GPU_DEVICE):
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# with cp.cuda.Device(pystencils.GPU_DEVICE):
# cp.cuda.runtime.deviceSynchronize()
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node
......
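The launch wrapper no longer enters a globally configured device; it infers the device from the kernel's cupy arguments and requires them to agree. A standalone sketch of that inference, assuming cupy:

    import cupy as cp

    args = (cp.zeros(8), cp.ones(8))  # stand-ins for the cached kernel arguments
    device = set(a.device.id for a in args if type(a) is cp.ndarray)
    assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
    with cp.cuda.Device(device.pop()):
        pass  # the compiled kernel is launched inside this device context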
@@ -2,7 +2,6 @@ from typing import Union
import numpy as np
import pystencils
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
@@ -11,7 +10,7 @@ from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.transformations import (
get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -22,8 +21,6 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
config: CreateKernelConfig):
function_name = config.function_name
if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice = config.iteration_slice
ghost_layers = config.ghost_layers
@@ -123,8 +120,6 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
index_fields = config.index_fields
function_name = config.function_name
coordinate_names = config.coordinate_names
if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
fields_written = assignments.bound_fields
......
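Since the implicit default is no longer injected here, a specific device for the GPU indexing machinery has to be requested explicitly when needed. A hedged sketch (option names taken from the surrounding code and the tests below):

    from pystencils import CreateKernelConfig, Target

    # gpu_indexing_params is forwarded to the indexing class (BlockIndexing by default),
    # which accepts a device_number for its block-size queries.
    config = CreateKernelConfig(target=Target.GPU,
                                gpu_indexing_params={'device_number': 0})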
@@ -15,6 +15,12 @@ except ImportError:
import unittest.mock
pytest = unittest.mock.MagicMock()
try:
import cupy.cuda.runtime
device_numbers = range(cupy.cuda.runtime.getDeviceCount())
except ImportError:
device_numbers = []
SCRIPT_FOLDER = Path(__file__).parent.absolute()
INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
@@ -365,10 +371,11 @@ def test_load_data():
assert np.all(dh.cpu_arrays['dst2']) == 0
def test_array_handler():
@pytest.mark.parametrize("device_number", device_numbers)
def test_array_handler(device_number):
size = (2, 2)
pytest.importorskip('cupy')
array_handler = GPUArrayHandler()
array_handler = GPUArrayHandler(device_number)
zero_array = array_handler.zeros(size)
cpu_array = np.empty(size)
......
import pytest
import numpy as np
import cupy as cp
import sympy as sp
from scipy.ndimage import convolve
import pystencils
from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
from pystencils.gpu import BlockIndexing
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
try:
import cupy
device_numbers = range(cupy.cuda.runtime.getDeviceCount())
except ImportError:
device_numbers = []
def test_averaging_kernel():
size = (40, 55)
@@ -154,7 +161,8 @@ def test_periodicity():
np.testing.assert_equal(cpu_result, gpu_result)
def test_block_indexing():
@pytest.mark.parametrize("device_number", device_numbers)
def test_block_indexing(device_number):
f = fields("f: [3D]")
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -164,16 +172,16 @@ def test_block_indexing():
assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
maximum_block_size="auto", device_number=device_number)
# This function should be used if number of needed registers is known. Can be determined with func.num_regs
registers_per_thread = 1000
blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
else:
device = cp.cuda.Device(pystencils.GPU_DEVICE)
device = cp.cuda.Device(device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
......
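The register-limit query in the test above now runs once per real device. A minimal sketch of the non-HIP branch, assuming cupy and a CUDA device 0:

    import cupy as cp

    device = cp.cuda.Device(0)
    # Device.attributes maps CUDA device attribute names to their values.
    max_registers_per_block = device.attributes.get("MaxRegistersPerBlock")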