diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
index d653d18b1448b306d24d9f08c36e256074312f79..5a4852cc92980232020d9cbc8c2a4ec07cf1db3b 100644
--- a/pystencils_benchmark/gpu/benchmark.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -17,12 +17,35 @@ from pystencils_benchmark.common import (_env,
 from pystencils_benchmark.enums import Compiler
 
 
+def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
+    """Insert a ``__launch_bounds__(...)`` qualifier into generated kernel code.
+
+    The qualifier is placed directly in front of the first ``void`` in *code*;
+    that ``void`` keyword and every later occurrence are left in place.
+
+    Args:
+        code: Generated kernel header or source text.
+        launch_bounds: Values joined with commas to form the qualifier's
+            argument list.
+
+    Returns:
+        The annotated code, or *code* unchanged if it contains no ``void``.
+    """
+    head, keyword, tail = code.partition("void")
+    if not keyword:
+        # Nothing to annotate; appending the qualifier would only corrupt the code.
+        return code
+    lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})"
+    return f"{head}{lb_str} {keyword}{tail}"
+
+
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
                        compiler: Compiler = Compiler.NVCC,
                        timing: bool = True,
-                       cuda_block_size: tuple = (32, 1, 1)
+                       cuda_block_size: tuple = (32, 1, 1),
+                       launch_bounds: tuple = None,
                        ) -> None:
 
     src_path, include_path = setup_directories(path)
@@ -34,10 +41,14 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
         kernel_name = kernel_ast.function_name
 
         header = kernel_header(kernel_ast)
+        if launch_bounds:
+            header = _add_launch_bound(header, launch_bounds)
         with open(include_path / f'{kernel_name}.h', 'w+') as f:
             f.write(header)
 
         source = kernel_source(kernel_ast)
+        if launch_bounds:
+            source = _add_launch_bound(source, launch_bounds)
         # TODO CUDA specific suffix
         with open(src_path / f'{kernel_name}.cu', 'w+') as f:
             f.write(source)
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 0c42c7907be4c85b99b46f05acd9270598a42bc3..929d4f8a59d2fec219fb948da9c87e8ff9c31c2f 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -41,7 +41,11 @@ def test_generate(compiler, config_kwarg):
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
 
-def test_generate_gpu():
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})  # extra generate_benchmark kwargs per test case
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
     compiler = Compiler.NVCC
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
 
@@ -52,6 +56,6 @@ def test_generate_gpu():
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler)
+        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs)
         # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)