pycodegen / pystencils · Commits · c427c469

Commit c427c469, authored 6 months ago by Richard Angersbach

    Format gpu_indexing.py

Parent: 2624d819
No related branches found
No related tags found
1 merge request: !454 Optimization for GPU block size determination

Pipeline #75759 failed, 6 months ago
Stages: Code Quality, Unit Tests, legacy_test, docs
Showing 1 changed file with 57 additions and 31 deletions: src/pystencils/codegen/gpu_indexing.py (+57 −31)
@@ -66,14 +66,17 @@ class GpuLaunchConfiguration(ABC):

    @staticmethod
    def block_size_exceeds_hw_limits(
        block_size: tuple[int, ...], hw_props: HardwareProperties
    ) -> bool:
        """Checks if provided block size conforms limits given by the hardware."""
        return (
            any(
                size > max_size
                for size, max_size in zip(block_size, hw_props.max_block_sizes)
            )
            or prod(block_size) > hw_props.max_threads_per_block
        )

    @staticmethod
    def _gen_error_msg(block_size: tuple[int, ...]):
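The hunk above only re-wraps the hardware-limit check; the predicate itself is unchanged: a block size is rejected when any dimension exceeds the per-dimension maximum or when the total thread count exceeds the per-block maximum. A minimal standalone sketch of that predicate, assuming typical CUDA-like limits (the HwLimits container and the numbers are illustrative stand-ins, not the pystencils HardwareProperties class):

from dataclasses import dataclass
from math import prod


@dataclass
class HwLimits:
    # illustrative stand-in for the HardwareProperties object used in the diff
    max_threads_per_block: int
    max_block_sizes: tuple[int, int, int]


def exceeds_hw_limits(block_size: tuple[int, ...], hw: HwLimits) -> bool:
    # reject when any dimension is too large or the block holds too many threads overall
    return (
        any(size > max_size for size, max_size in zip(block_size, hw.max_block_sizes))
        or prod(block_size) > hw.max_threads_per_block
    )


limits = HwLimits(max_threads_per_block=1024, max_block_sizes=(1024, 1024, 64))
print(exceeds_hw_limits((32, 8, 1), limits))   # False: 256 threads, every dimension in range
print(exceeds_hw_limits((32, 32, 2), limits))  # True: 2048 threads exceed 1024 per block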
@@ -81,15 +84,15 @@ class GpuLaunchConfiguration(ABC):
            f"Final block size was too large: {block_size}."
        )

    @staticmethod
    def _round_block_sizes_to_warp_size(
        to_round: list[int], warp_size: int
    ) -> tuple[int, ...]:
        # check if already aligns with warp size
        if prod(to_round) % warp_size == 0:
            return tuple(to_round)

        # find index of element closest to warp size and round up
        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
        if index_to_round + 1 < len(to_round):
            return (
                *to_round[:index_to_round],
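The rounding helper shown above leaves a block size untouched when its total thread count is already a multiple of the warp size and otherwise picks the dimension whose remainder modulo the warp size is largest. The tail of the function is collapsed in this diff, so the following sketch completes it under a simple assumption (only the selected dimension is rounded up; the actual code may treat the trailing dimensions differently):

from math import prod


def ceil_to_multiple(value: int, factor: int) -> int:
    # smallest multiple of `factor` that is >= `value`
    return ((value + factor - 1) // factor) * factor


def round_to_warp_size_sketch(to_round: list[int], warp_size: int) -> tuple[int, ...]:
    # already a whole number of warps: keep as-is
    if prod(to_round) % warp_size == 0:
        return tuple(to_round)
    # pick the dimension with the largest remainder modulo the warp size ...
    index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
    # ... and round only that dimension up (assumed completion of the collapsed tail)
    rounded = list(to_round)
    rounded[index_to_round] = ceil_to_multiple(rounded[index_to_round], warp_size)
    return tuple(rounded)


print(round_to_warp_size_sketch([16, 4, 1], 32))  # (16, 4, 1): 64 threads, already aligned
print(round_to_warp_size_sketch([10, 3, 1], 32))  # (32, 3, 1): dim 0 has the largest remainder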
@@ -152,7 +155,9 @@ class GpuLaunchConfiguration(ABC):
        # case 2: trimmed block is equivalent to the whole iteration space
        if all(b == i for b, i in zip(trimmed, it_space)):
            return check_sizes_and_return(
                cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
            )
        else:
            # double block size in each dimension until block is large enough (or case 2 triggers)
            for d in resize_order:
@@ -175,7 +180,11 @@ class GpuLaunchConfiguration(ABC):
                # case 3: trim block is large enough
                if prod(trimmed) >= hw_props.warp_size:
                    return check_sizes_and_return(
                        cls._round_block_sizes_to_warp_size(trimmed, hw_props.warp_size)
                    )

        raise CodegenError("Unable to determine GPU block size for this kernel.")
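These two hunks re-wrap the "case 2" and "case 3" exits of the block-size fitting routine: return early when the trimmed block already covers the whole iteration space, or once doubling has made the trimmed block at least one warp large. A much-simplified sketch of that control flow, assuming a warp width of 32 and a plain doubling loop (not the pystencils implementation):

from math import prod

WARP_SIZE = 32  # assumed warp width for the example


def fit_block_size_sketch(
    it_space: tuple[int, ...], initial: tuple[int, ...]
) -> tuple[int, ...]:
    # trim each dimension to the extent of the iteration space
    trimmed = [min(b, i) for b, i in zip(initial, it_space)]

    # case 2: the trimmed block already covers the whole iteration space
    if all(b == i for b, i in zip(trimmed, it_space)):
        return tuple(trimmed)

    # otherwise double dimensions (fastest first) until the block spans a warp
    for d in range(len(trimmed)):
        while trimmed[d] * 2 <= it_space[d] and prod(trimmed) < WARP_SIZE:
            trimmed[d] *= 2
        if prod(trimmed) >= WARP_SIZE:  # case 3: block is large enough
            return tuple(trimmed)

    return tuple(trimmed)


print(fit_block_size_sketch((128, 128, 1), (4, 4, 1)))  # doubles dim 0 once: (8, 4, 1) = 32 threads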
@@ -211,8 +220,11 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
        # -> round block size in fastest moving dimension up to multiple of warp size
        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
        block_size = (
            (
                ceil_to_multiple(block_size[0], self._hw_props.warp_size)
                if self._assume_warp_aligned_block_size
                else block_size[0]
            ),
            *block_size[1:],
        )
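In AutomaticLaunchConfiguration, each block-size entry is stored as a callable of the kernel's runtime parameters; the hunk above evaluates those callables and then, if `assume_warp_aligned_block_size` is set, rounds the fastest-moving dimension up to a warp multiple. A sketch of that two-step pattern with hypothetical deferred entries (the lambdas, the parameter name `N`, and the inline rounding arithmetic are assumptions for illustration):

from typing import Callable

# hypothetical deferred block size: each entry is a callable of runtime kernel parameters
deferred_block_size: tuple[Callable[..., int], ...] = (
    lambda **kwargs: kwargs["N"],  # fastest dimension follows an assumed field extent N
    lambda **kwargs: 4,
    lambda **kwargs: 1,
)


def evaluate_block_size(warp_size: int, assume_warp_aligned: bool, **kwargs) -> tuple[int, ...]:
    # step 1: evaluate the deferred entries with the runtime parameters
    block_size = tuple(int(bs(**kwargs)) for bs in deferred_block_size)
    # step 2: optionally round the fastest-moving dimension up to a multiple of the warp size
    return (
        (
            ((block_size[0] + warp_size - 1) // warp_size) * warp_size
            if assume_warp_aligned
            else block_size[0]
        ),
        *block_size[1:],
    )


print(evaluate_block_size(32, True, N=20))   # (32, 4, 1)
print(evaluate_block_size(32, False, N=20))  # (20, 4, 1)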
@@ -233,9 +245,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
    """

    def __init__(
        self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False
    ) -> None:
        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
@@ -271,9 +281,14 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
        if self._grid_size is None:
            raise AttributeError("No GPU grid size was set by the user.")

        if (
            self._assume_warp_aligned_block_size
            and prod(self._block_size) % self._hw_props.warp_size != 0
        ):
            raise CodegenError(
                "Specified block sizes must align with warp size with "
                "`assume_warp_aligned_block_size` enabled."
            )

        return self._block_size, self._grid_size
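The hunk above only re-wraps the validation in ManualLaunchConfiguration: a user-supplied grid size must exist, and if `assume_warp_aligned_block_size` was promised, the block must contain a whole number of warps. A self-contained sketch of the same checks (a plain ValueError stands in for the project's CodegenError):

from math import prod


def validate_manual_launch(
    block_size: tuple[int, int, int],
    grid_size: tuple[int, int, int] | None,
    warp_size: int,
    assume_warp_aligned_block_size: bool,
):
    # the user must have supplied a grid size before the kernel can be launched
    if grid_size is None:
        raise AttributeError("No GPU grid size was set by the user.")
    # if warp alignment was promised, the block must contain a whole number of warps
    if assume_warp_aligned_block_size and prod(block_size) % warp_size != 0:
        raise ValueError(
            "Specified block sizes must align with warp size with "
            "`assume_warp_aligned_block_size` enabled."
        )
    return block_size, grid_size


print(validate_manual_launch((64, 2, 1), (16, 16, 1), 32, True))  # passes: 128 threads = 4 warps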
@@ -347,17 +362,26 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
        computed_block_size: tuple[int, ...]
        try:
            if self._use_block_size_fitting:
                computed_block_size = self.fit_block_size(
                    num_work_items, self._block_size, self._hw_props
                )
            else:
                computed_block_size = self.trim_block_size(
                    num_work_items, self._block_size, self._hw_props
                )

                # check if assumption for warp size alignment is met
                if (
                    self._assume_warp_aligned_block_size
                    and prod(computed_block_size) % self._hw_props.warp_size != 0
                ):
                    raise CodegenError(
                        "Adapted block size is not divisible by warp size."
                    )
        except CodegenError as e:
            warn(
                f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. "
                f"Block size fitting could not determine optimal block size configuration. "
                f"Defaulting back to {self._block_size}"
            )
            computed_block_size = self._block_size

        adapted_block_size = cast(dim3, computed_block_size)
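This hunk re-wraps the try/except in DynamicBlockSizeLaunchConfiguration: fitting or trimming may raise a CodegenError, in which case a warning is emitted and the user-configured default block size is used instead. A self-contained sketch of that fallback pattern (the trimming stand-in, the warp width, and the use of ValueError instead of CodegenError are assumptions):

from math import prod
from warnings import warn

WARP_SIZE = 32               # assumed warp width
DEFAULT_BLOCK = (128, 1, 1)  # assumed user-configured default block size


def compute_block_size(num_work_items: tuple[int, ...]) -> tuple[int, ...]:
    try:
        # stand-in for fit_block_size / trim_block_size: shrink to the work-item extent
        computed = tuple(min(b, w) for b, w in zip(DEFAULT_BLOCK, num_work_items))
        if prod(computed) % WARP_SIZE != 0:
            raise ValueError("Adapted block size is not divisible by warp size.")
        return computed
    except ValueError as e:
        # on failure, warn and fall back to the user-configured default block size
        warn(f"Error occurred: {e!r}. Defaulting back to {DEFAULT_BLOCK}")
        return DEFAULT_BLOCK


print(compute_block_size((256, 16, 1)))  # (128, 1, 1): trimmed size stays warp-aligned
print(compute_block_size((20, 1, 1)))    # warns, then falls back to (128, 1, 1)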
@@ -457,8 +481,11 @@ class GpuIndexing:
    def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
        """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
        if self._manual_launch_grid:

            def factory():
                return ManualLaunchConfiguration(
                    self._hw_props, self._assume_warp_aligned_block_size
                )

            return factory
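get_launch_config_factory returns a zero-argument closure so the runtime can build a fresh launch configuration whenever it needs one; the hunk above merely wraps the ManualLaunchConfiguration construction inside that closure. A sketch of the factory pattern with a hypothetical configuration class:

from typing import Callable


class LaunchConfigSketch:
    """Hypothetical stand-in for a launch configuration object."""

    def __init__(self, warp_size: int, assume_warp_aligned_block_size: bool) -> None:
        self.warp_size = warp_size
        self.assume_warp_aligned_block_size = assume_warp_aligned_block_size


def get_launch_config_factory(
    warp_size: int, assume_warp_aligned_block_size: bool
) -> Callable[[], LaunchConfigSketch]:
    # return a zero-argument factory; the runtime calls it later to build a fresh config
    def factory() -> LaunchConfigSketch:
        return LaunchConfigSketch(warp_size, assume_warp_aligned_block_size)

    return factory


make_config = get_launch_config_factory(32, True)
config = make_config()  # each call produces an independent configuration object
print(config.warp_size, config.assume_warp_aligned_block_size)  # 32 True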
@@ -481,8 +508,7 @@ class GpuIndexing:
        )

        work_items_expr += tuple(
            self._ast_factory.parse_index(1) for _ in range(3 - rank)
        )

        num_work_items = cast(
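The hunk above joins the padding expression onto fewer lines: GPU launches are always three-dimensional, so iteration spaces of lower rank are padded with index 1 in the missing dimensions. A plain-Python sketch of the same idea (without the AST factory):

def pad_work_items_to_3d(work_items: tuple[int, ...]) -> tuple[int, ...]:
    # GPU launches are always three-dimensional; pad missing dimensions with extent 1
    rank = len(work_items)
    return tuple(work_items) + tuple(1 for _ in range(3 - rank))


print(pad_work_items_to_3d((1024,)))     # (1024, 1, 1)
print(pad_work_items_to_3d((256, 256)))  # (256, 256, 1)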
@@ -545,7 +571,7 @@ class GpuIndexing:
                block_size,
                cast(_Dim3Lambda, grid_size),
                self._hw_props,
                self._assume_warp_aligned_block_size,
            )

        return factory