Skip to content
Snippets Groups Projects
Commit fdaa6f10 authored by Markus Holzer
Browse files

Small changes

parent a2253b0a
No related branches found
No related tags found
1 merge request: !335 "Fix indexing for AMD GPUs"
Pipeline #54086 failed
...@@ -105,7 +105,7 @@ class BlockIndexing(AbstractIndexing): ...@@ -105,7 +105,7 @@ class BlockIndexing(AbstractIndexing):
def __init__(self, field, iteration_slice, def __init__(self, field, iteration_slice,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64), device_number=0): maximum_block_size=(1024, 1024, 64), device_number=None):
if field.spatial_dimensions > 3: if field.spatial_dimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions") raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
...@@ -114,6 +114,7 @@ class BlockIndexing(AbstractIndexing): ...@@ -114,6 +114,7 @@ class BlockIndexing(AbstractIndexing):
self._block_size = block_size self._block_size = block_size
if maximum_block_size == 'auto': if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
# Get device limits # Get device limits
import cupy as cp import cupy as cp
# See https://github.com/cupy/cupy/issues/7676 # See https://github.com/cupy/cupy/issues/7676
...@@ -186,27 +187,22 @@ class BlockIndexing(AbstractIndexing): ...@@ -186,27 +187,22 @@ class BlockIndexing(AbstractIndexing):
def iteration_space(self, arr_shape): def iteration_space(self, arr_shape):
return _iteration_space(self._iterationSlice, arr_shape) return _iteration_space(self._iterationSlice, arr_shape)
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread, device_number): def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block. """Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation. This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a cupy function. They can be obtained by ``func.num_regs`` from a cupy function.
Args: Args:
block_size: used block size that is target for limiting block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread required_registers_per_thread: needed registers per thread
device_number: device number of the used GPU. By default, the zeroth device is used.
returns: smaller block_size if too many registers are used. returns: smaller block_size if too many registers are used.
""" """
import cupy as cp import cupy as cp
if device_number != self._device_number:
warnings.warn(f"BlockIndexing was set up with device number: {self._device_number}, but for limiting"
f"the GPU blocks to the hardware device number {device_number} was used.")
# See https://github.com/cupy/cupy/issues/7676 # See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip: if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number) max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else: else:
device = cp.cuda.Device(device_number) device = cp.cuda.Device(self._device_number)
da = device.attributes da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock") max_registers_per_block = da.get("MaxRegistersPerBlock")
......
...@@ -163,11 +163,12 @@ def test_block_indexing(): ...@@ -163,11 +163,12 @@ def test_block_indexing():
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False) bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2) assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto") bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
# This function should be used if number of needed registers is known. Can be determined with func.num_regs # This function should be used if number of needed registers is known. Can be determined with func.num_regs
registers_per_thread = 1000 registers_per_thread = 1000
blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread, pystencils.GPU_DEVICE) blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
if cp.cuda.runtime.is_hip: if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE) max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment