From 9140da63f4446be8b1c9c5f58617654f5fffb485 Mon Sep 17 00:00:00 2001
From: Christoph Alt <christoph.alt@fau.de>
Date: Tue, 8 Aug 2023 16:22:11 +0200
Subject: [PATCH] Added a parameter to insert launch bounds into the kernel

---
 pystencils_benchmark/gpu/benchmark.py | 13 ++++++++++++-
 tests/test_benchmark.py               |  8 ++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
index d653d18..5a4852c 100644
--- a/pystencils_benchmark/gpu/benchmark.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -17,12 +17,19 @@ from pystencils_benchmark.common import (_env,
 from pystencils_benchmark.enums import Compiler
 
 
+def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
+    """Insert ``__launch_bounds__(...)`` after the first ``void``; the return type is kept."""
+    lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})"
+    return code.replace("void", f"void {lb_str}", 1)
+
+
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
                        compiler: Compiler = Compiler.NVCC,
                        timing: bool = True,
-                       cuda_block_size: tuple = (32, 1, 1)
+                       cuda_block_size: tuple = (32, 1, 1),
+                       launch_bounds: tuple = None,
                        ) -> None:
 
     src_path, include_path = setup_directories(path)
@@ -34,10 +41,14 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
         kernel_name = kernel_ast.function_name
 
         header = kernel_header(kernel_ast)
+        if launch_bounds:
+            header = _add_launch_bound(header, launch_bounds)
         with open(include_path / f'{kernel_name}.h', 'w+') as f:
             f.write(header)
 
         source = kernel_source(kernel_ast)
+        if launch_bounds:
+            source = _add_launch_bound(source, launch_bounds)
         # TODO CUDA specific suffix
         with open(src_path / f'{kernel_name}.cu', 'w+') as f:
             f.write(source)
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 0c42c79..929d4f8 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -41,7 +41,11 @@ def test_generate(compiler, config_kwarg):
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
-def test_generate_gpu():
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
     compiler = Compiler.NVCC
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
 
@@ -52,6 +56,6 @@ def test_generate_gpu():
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler)
+        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs)
         # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
-- 
GitLab