Skip to content
Snippets Groups Projects
Commit f65cd9bf authored by Richard Angersbach
Browse files

Create expression for ceil_to_multiple manually

parent 1a2b8f74
No related branches found
No related tags found
1 merge request: !454 "Optimization for GPU block size determination"
Pipeline #76543 failed
......@@ -20,7 +20,7 @@ from ..backend.kernelcreation import (
)
from ..backend.platforms.cuda import ThreadMapping
from ..backend.ast.expressions import PsExpression
from ..backend.ast.expressions import PsExpression, PsIntDiv
from math import prod
from ..utils import ceil_to_multiple
......@@ -605,9 +605,10 @@ class GpuIndexing:
# -> round block size in fastest moving dimension up to multiple of warp size
rounded_block_size: PsExpression
if self._assume_warp_aligned_block_size:
rounded_block_size = ceil_to_multiple(
work_items[0],
PsExpression.make(PsConstant(self._hw_props.warp_size, work_items[0].dtype)))
warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
rounded_block_size = self._ast_factory.parse_index(
PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1),
warp_size.clone()) * warp_size.clone())
else:
rounded_block_size = work_items[0]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment