Commit b2a59bbf authored by Markus Holzer

Fix indexing for AMD GPUs

Parent: e20d47da
Merge request: !335 Fix indexing for AMD GPUs
Pipeline: #53976 failed
 import abc
 from functools import partial
+import math
 import sympy as sp
 from sympy.core.cache import cacheit
@@ -112,8 +113,12 @@ class BlockIndexing(AbstractIndexing):
         if maximum_block_size == 'auto':
             # Get device limits
             import cupy as cp
-            device = cp.cuda.Device(pystencils.GPU_DEVICE)
-            da = device.attributes
-            maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
+            # See https://github.com/cupy/cupy/issues/7676
+            if cp.cuda.runtime.is_hip:
+                device_number = pystencils.GPU_DEVICE
+                maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
+            else:
+                da = cp.cuda.Device(pystencils.GPU_DEVICE).attributes
+                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

         self._maximum_block_size = maximum_block_size
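
On HIP, cupy's Device.attributes dictionary is not populated reliably (the linked cupy issue 7676), so the fix queries the runtime by numeric attribute ID instead; the IDs 26-28 presumably correspond to hipDeviceAttributeMaxBlockDimX/Y/Z. A minimal standalone sketch of the same query outside of pystencils (the helper name max_block_dims and the default device 0 are illustrative, not part of the commit):

import cupy as cp

def max_block_dims(device_id=0):
    # Work around https://github.com/cupy/cupy/issues/7676: under HIP the
    # named-attribute dictionary is unreliable, so query by numeric ID.
    if cp.cuda.runtime.is_hip:
        # Assumption: IDs 26, 27, 28 are hipDeviceAttributeMaxBlockDim{X,Y,Z}
        return tuple(cp.cuda.runtime.deviceGetAttribute(i, device_id) for i in range(26, 29))
    else:
        attrs = cp.cuda.Device(device_id).attributes
        return tuple(attrs[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

print(max_block_dims())  # typically (1024, 1024, 64) on current NVIDIA GPUs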
@@ -179,31 +184,35 @@ class BlockIndexing(AbstractIndexing):
         return _iteration_space(self._iterationSlice, arr_shape)

     @staticmethod
-    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
-        """Shrinks the block_size if there are too many registers used per multiprocessor.
+    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread):
+        """Shrinks the block_size if there are too many registers used per block.

         This is not done automatically, since the required_registers_per_thread are not known before compilation.
         They can be obtained by ``func.num_regs`` from a cupy function.

         Args:
             block_size: used block size that is target for limiting
             required_registers_per_thread: needed registers per thread

         :returns smaller block_size if too many registers are used.
         """
         import cupy as cp

-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
-        da = device.attributes
-        available_registers_per_mp = da.get("MaxRegistersPerMultiprocessor")
+        device_number = pystencils.GPU_DEVICE
+        # See https://github.com/cupy/cupy/issues/7676
+        if cp.cuda.runtime.is_hip:
+            max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
+        else:
+            device = cp.cuda.Device(pystencils.GPU_DEVICE)
+            da = device.attributes
+            max_registers_per_block = da.get("MaxRegistersPerBlock")

-        block = block_size
+        result = list(block_size)
         while True:
-            num_threads = 1
-            for t in block:
-                num_threads *= t
-            required_registers_per_mt = num_threads * required_registers_per_thread
-            if required_registers_per_mt <= available_registers_per_mp:
-                return block
+            required_registers = math.prod(result) * required_registers_per_thread
+            if required_registers <= max_registers_per_block:
+                return result
             else:
-                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
-                assert block[largest_grid_entry_idx] >= 2
-                block[largest_grid_entry_idx] //= 2
+                largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
+                assert result[largest_list_entry_idx] >= 2
+                result[largest_list_entry_idx] //= 2

     @staticmethod
     def permute_block_size_according_to_layout(block_size, layout):
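
The rewritten loop multiplies the thread count (math.prod(result)) by the per-thread register usage and halves the largest block dimension until the total fits the budget, which is now the per-block register limit rather than the per-multiprocessor one. A self-contained sketch of the same strategy with a worked example (the 65536-registers-per-block budget is an assumption, typical for recent NVIDIA devices):

import math

def shrink_block(block_size, regs_per_thread, max_regs_per_block):
    # Same halving strategy as limit_block_size_by_register_restriction above.
    result = list(block_size)
    while math.prod(result) * regs_per_thread > max_regs_per_block:
        largest = max(range(len(result)), key=lambda i: result[i])
        assert result[largest] >= 2, "block cannot be shrunk any further"
        result[largest] //= 2  # halve the largest dimension and retry
    return result

# (1024, 1024, 1) threads at 1000 registers each is far over budget;
# repeated halving of the largest dimension ends at 64 threads:
print(shrink_block([1024, 1024, 1], 1000, 65536))  # -> [8, 8, 1], since 64 * 1000 <= 65536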
@@ -3,10 +3,12 @@ import cupy as cp
 import sympy as sp
 from scipy.ndimage import convolve

+import pystencils
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers

+device_number = pystencils.GPU_DEVICE

 def test_averaging_kernel():
@@ -163,7 +165,17 @@ def test_block_indexing():
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)

     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")

     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], 1000)
+    registers_per_thread = 1000
+    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+
+    if cp.cuda.runtime.is_hip:
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
+    else:
+        device = cp.cuda.Device(pystencils.GPU_DEVICE)
+        da = device.attributes
+        max_registers_per_block = da.get("MaxRegistersPerBlock")
+
+    assert np.prod(blocks) * registers_per_thread < max_registers_per_block
     assert sum(blocks) < sum([1024, 1024, 1])
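
The test hard-codes registers_per_thread = 1000 because no kernel is compiled at that point. Following the docstring's hint, the value could instead be read from a compiled cupy kernel; a sketch with a plain cp.RawKernel (the kernel source and names are illustrative, not part of the test suite):

import cupy as cp

# Any compiled kernel works; this one exists only to have something to inspect.
kernel = cp.RawKernel(r'''
extern "C" __global__ void scale(float* x, float a, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
''', 'scale')

# Accessing the attributes dict triggers compilation; 'num_regs' is the
# per-thread register count that limit_block_size_by_register_restriction needs.
registers_per_thread = kernel.attributes['num_regs']
print(registers_per_thread)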