diff --git a/pystencils/gpu/indexing.py b/pystencils/gpu/indexing.py index c6138616405279ed7b93454192e9dd63946c2345..e8e2e99f5eee4ab5b8b5caae6386178d9916e958 100644 --- a/pystencils/gpu/indexing.py +++ b/pystencils/gpu/indexing.py @@ -105,7 +105,7 @@ class BlockIndexing(AbstractIndexing): def __init__(self, field, iteration_slice, block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, - maximum_block_size=(1024, 1024, 64), device_number=0): + maximum_block_size=(1024, 1024, 64), device_number=None): if field.spatial_dimensions > 3: raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions") @@ -114,6 +114,7 @@ class BlockIndexing(AbstractIndexing): self._block_size = block_size if maximum_block_size == 'auto': + assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated' # Get device limits import cupy as cp # See https://github.com/cupy/cupy/issues/7676 @@ -186,27 +187,22 @@ class BlockIndexing(AbstractIndexing): def iteration_space(self, arr_shape): return _iteration_space(self._iterationSlice, arr_shape) - def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread, device_number): + def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread): """Shrinks the block_size if there are too many registers used per block. This is not done automatically, since the required_registers_per_thread are not known before compilation. They can be obtained by ``func.num_regs`` from a cupy function. Args: block_size: used block size that is target for limiting required_registers_per_thread: needed registers per thread - device_number: device number of the used GPU. By default, the zeroth device is used. returns: smaller block_size if too many registers are used. """ import cupy as cp - if device_number != self._device_number: - warnings.warn(f"BlockIndexing was set up with device number: {self._device_number}, but for limiting" - f"the GPU blocks to the hardware device number {device_number} was used.") - # See https://github.com/cupy/cupy/issues/7676 if cp.cuda.runtime.is_hip: - max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number) + max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number) else: - device = cp.cuda.Device(device_number) + device = cp.cuda.Device(self._device_number) da = device.attributes max_registers_per_block = da.get("MaxRegistersPerBlock") diff --git a/pystencils_tests/test_gpu.py b/pystencils_tests/test_gpu.py index aa86dcd31f955ef4fb64289fafeab804342e6e1e..b0af7950da352c633d73d7115bf1ea661f884937 100644 --- a/pystencils_tests/test_gpu.py +++ b/pystencils_tests/test_gpu.py @@ -163,11 +163,12 @@ def test_block_indexing(): bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False) assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2) - bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto") + bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), + maximum_block_size="auto", device_number=pystencils.GPU_DEVICE) # This function should be used if number of needed registers is known. Can be determined with func.num_regs registers_per_thread = 1000 - blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread, pystencils.GPU_DEVICE) + blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread) if cp.cuda.runtime.is_hip: max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)