Skip to content
Snippets Groups Projects
Commit 19846d04 authored by Markus Holzer's avatar Markus Holzer
Browse files

Small changes

parent b2a59bbf
No related branches found
No related tags found
1 merge request!335Fix indexing for AMD GPUs
Pipeline #53995 failed
......@@ -5,7 +5,6 @@ import math
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.astnodes import Block, Conditional
from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
......@@ -98,11 +97,15 @@ class BlockIndexing(AbstractIndexing):
permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
gets the largest amount of threads
compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
maximum block size from the device properties
device_number: device number of the used GPU. By default, the zeroth device is used.
"""
def __init__(self, field, iteration_slice,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)):
maximum_block_size=(1024, 1024, 64), device_number=0):
if field.spatial_dimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
......@@ -115,10 +118,9 @@ class BlockIndexing(AbstractIndexing):
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
device_number = pystencils.GPU_DEVICE
maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
else:
da = cp.cuda.Device(pystencils.GPU_DEVICE).attributes
da = cp.cuda.Device(device_number).attributes
maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size
......@@ -184,23 +186,23 @@ class BlockIndexing(AbstractIndexing):
return _iteration_space(self._iterationSlice, arr_shape)
@staticmethod
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread):
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device_number=0):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a cupy function.
Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
:returns smaller block_size if too many registers are used.
device_number: device number of the used GPU. By default, the zeroth device is used.
returns: smaller block_size if too many registers are used.
"""
import cupy as cp
device_number = pystencils.GPU_DEVICE
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
else:
device = cp.cuda.Device(pystencils.GPU_DEVICE)
device = cp.cuda.Device(device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
......
......@@ -165,7 +165,7 @@ def test_block_indexing():
assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")
# This function should be used if number of needed registers is known. Can be determined with func.num_regs
registers_per_thread = 1000
blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
......
0% — Loading failed or was interrupted.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment