Commit a892311d authored by Michael Kuron

Merge branch 'FixIndexing' into 'master'

Fix indexing for AMD GPUs

See merge request !335
parents b9d9c5cc 145c5264
Pipeline #54101 failed
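The core of the change, visible in the diff below: on HIP (AMD) builds of cupy, querying the block-size and register limits through cp.cuda.Device(...).attributes is not reliable (see https://github.com/cupy/cupy/issues/7676), so the limits are read via cupy.cuda.runtime.deviceGetAttribute with numeric attribute IDs instead. A minimal standalone sketch of that dispatch, assuming cupy is installed; the helper name is illustrative and the IDs 26-28 are the ones the diff uses for MaxBlockDimX/Y/Z:

import cupy as cp

def query_max_block_size(device_number=0):
    # On HIP, fall back to numeric attribute IDs (26-28 = MaxBlockDimX/Y/Z),
    # since the named Device.attributes lookup is unreliable there.
    if cp.cuda.runtime.is_hip:
        return tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number)
                     for i in range(26, 29))
    da = cp.cuda.Device(device_number).attributes
    return tuple(da[f"MaxBlockDim{c}"] for c in "XYZ")

print(query_max_block_size())  # e.g. (1024, 1024, 64) on many NVIDIA GPUs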
 import abc
 from functools import partial
+import math
 import sympy as sp
 from sympy.core.cache import cacheit
-import pystencils
 from pystencils.astnodes import Block, Conditional
 from pystencils.typing import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
@@ -97,11 +97,14 @@ class BlockIndexing(AbstractIndexing):
         permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                 gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
+        maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
+                            maximum block size from the device properties
+        device_number: device number of the used GPU. By default, the zeroth device is used.
     """

     def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
-                 maximum_block_size=(1024, 1024, 64)):
+                 maximum_block_size=(1024, 1024, 64), device_number=None):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -110,17 +113,22 @@ class BlockIndexing(AbstractIndexing):
         self._block_size = block_size

         if maximum_block_size == 'auto':
+            assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
             # Get device limits
             import cupy as cp
-            device = cp.cuda.Device(pystencils.GPU_DEVICE)
-            da = device.attributes
-            maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
+            # See https://github.com/cupy/cupy/issues/7676
+            if cp.cuda.runtime.is_hip:
+                maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
+            else:
+                da = cp.cuda.Device(device_number).attributes
+                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])

         self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
         self._compile_time_block_size = compile_time_block_size
+        self._device_number = device_number

     @property
     def cuda_indices(self):
@@ -178,32 +186,34 @@ class BlockIndexing(AbstractIndexing):
     def iteration_space(self, arr_shape):
         return _iteration_space(self._iterationSlice, arr_shape)

-    @staticmethod
-    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
-        """Shrinks the block_size if there are too many registers used per multiprocessor.
+    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
+        """Shrinks the block_size if there are too many registers used per block.

         This is not done automatically, since the required_registers_per_thread are not known before compilation.
         They can be obtained by ``func.num_regs`` from a cupy function.

-        :returns smaller block_size if too many registers are used.
+        Args:
+            block_size: used block size that is target for limiting
+            required_registers_per_thread: needed registers per thread
+
+        returns: smaller block_size if too many registers are used.
         """
         import cupy as cp
-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
-        da = device.attributes
-
-        available_registers_per_mp = da.get("MaxRegistersPerMultiprocessor")
-
-        block = block_size
+        # See https://github.com/cupy/cupy/issues/7676
+        if cp.cuda.runtime.is_hip:
+            max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
+        else:
+            device = cp.cuda.Device(self._device_number)
+            da = device.attributes
+            max_registers_per_block = da.get("MaxRegistersPerBlock")

+        result = list(block_size)
         while True:
-            num_threads = 1
-            for t in block:
-                num_threads *= t
-            required_registers_per_mt = num_threads * required_registers_per_thread
-            if required_registers_per_mt <= available_registers_per_mp:
-                return block
+            required_registers = math.prod(result) * required_registers_per_thread
+            if required_registers <= max_registers_per_block:
+                return result
             else:
-                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
-                assert block[largest_grid_entry_idx] >= 2
-                block[largest_grid_entry_idx] //= 2
+                largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
+                assert result[largest_list_entry_idx] >= 2
+                result[largest_list_entry_idx] //= 2

     @staticmethod
     def permute_block_size_according_to_layout(block_size, layout):
...
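For reference, the rewritten limit_block_size_by_register_restriction above repeatedly halves the largest block dimension until the whole block fits into the per-block register budget. A minimal standalone sketch of the same halving strategy, with the budget passed in explicitly instead of being queried from the device (the function name is illustrative):

import math

def shrink_block_to_register_budget(block_size, registers_per_thread, max_registers_per_block):
    # Halve the largest dimension until the block's total register demand
    # (threads per block * registers per thread) fits into the budget.
    result = list(block_size)
    while math.prod(result) * registers_per_thread > max_registers_per_block:
        largest = max(range(len(result)), key=lambda i: result[i])
        assert result[largest] >= 2, "cannot shrink block any further"
        result[largest] //= 2
    return result

With a budget of 65536 registers per block and 1000 registers per thread, this shrinks [1024, 1024, 1] down to [8, 8, 1].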
@@ -2,6 +2,7 @@ from typing import Union
 import numpy as np

+import pystencils
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.config import CreateKernelConfig
 from pystencils.typing import StructType, TypedSymbol
@@ -10,7 +11,7 @@ from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
 from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
-from pystencils.gpu.indexing import indexing_creator_from_params
+from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.transformations import (
     get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -21,6 +22,8 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                        config: CreateKernelConfig):
     function_name = config.function_name
+    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
+        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
     iteration_slice = config.iteration_slice
     ghost_layers = config.ghost_layers
@@ -120,6 +123,8 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
     index_fields = config.index_fields
     function_name = config.function_name
     coordinate_names = config.coordinate_names
+    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
+        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
     fields_written = assignments.bound_fields
...
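On the kernel-creation side above, device_number is only filled in from pystencils.GPU_DEVICE when the user did not specify it, so a particular GPU can still be selected through the indexing parameters. A hedged usage sketch, assuming the pystencils 1.x API; the fields, the assignment and the device id are purely illustrative:

import pystencils as ps

src, dst = ps.fields("src, dst: float64[2D]")
config = ps.CreateKernelConfig(
    target=ps.Target.GPU,
    gpu_indexing_params={"block_size": (16, 8, 2),
                         "maximum_block_size": "auto",
                         "device_number": 1})  # pick the second GPU explicitly
kernel = ps.create_kernel(ps.Assignment(dst.center, src.center), config=config).compile()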
@@ -3,6 +3,7 @@ import cupy as cp
 import sympy as sp
 from scipy.ndimage import convolve

+import pystencils
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
@@ -162,8 +163,19 @@ def test_block_indexing():
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)

-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
+                       maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)

     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], 1000)
+    registers_per_thread = 1000
+    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+
+    if cp.cuda.runtime.is_hip:
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
+    else:
+        device = cp.cuda.Device(pystencils.GPU_DEVICE)
+        da = device.attributes
+        max_registers_per_block = da.get("MaxRegistersPerBlock")
+
+    assert np.prod(blocks) * registers_per_thread < max_registers_per_block
     assert sum(blocks) < sum([1024, 1024, 1])
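The func.num_regs hint from the test comment refers to compiled cupy kernels, whose per-thread register count can feed limit_block_size_by_register_restriction. A minimal sketch of obtaining it, assuming cupy with a working CUDA or HIP toolchain; the kernel source is illustrative:

import cupy as cp

code = r'''
extern "C" __global__ void scale(const double *src, double *dst, double factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = factor * src[i];
}
'''
kernel = cp.RawKernel(code, "scale")
kernel.compile()        # JIT-compile for the current device
print(kernel.num_regs)  # registers per thread, input for the block-size limiter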