diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
index 5a4852cc92980232020d9cbc8c2a4ec07cf1db3b..96fb58cada928da92aa55b50a344f694be6c78cc 100644
--- a/pystencils_benchmark/gpu/benchmark.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -18,9 +18,13 @@ from pystencils_benchmark.enums import Compiler
 
 
 def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
-    lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})"
-    splitted = code.split("void")
-    return splitted[0] + lb_str + "".join(splitted[1:])
+    """Insert ``__launch_bounds__(...)`` after the first ``void `` in *code*."""
+    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
+    # partition() splits at the first "void " only, so later occurrences stay intact.
+    prefix, void_kw, remainder = code.partition("void ")
+    if not void_kw:
+        return code  # no "void " present: nothing to annotate, leave code untouched
+    return prefix + void_kw + lb_str + remainder
 
 
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
diff --git a/tests/test_launch_bounds.py b/tests/test_launch_bounds.py
new file mode 100644
index 0000000000000000000000000000000000000000..48af06d028241c43c738dcba28f3c732f03662b8
--- /dev/null
+++ b/tests/test_launch_bounds.py
@@ -0,0 +1,19 @@
+import numpy as np
+import pystencils as ps
+from pystencils_benchmark.gpu.benchmark import kernel_header, _add_launch_bound, kernel_source
+
+
+def test_launch_bounds():
+    """Check that launch bounds follow ``void`` in kernel header and source."""
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+    launch_bounds = (256, 2)
+    expected = "void __launch_bounds__(256, 2)"
+    # Same check for both generators: first kernel_header, then kernel_source.
+    for generate in (kernel_header, kernel_source):
+        annotated = _add_launch_bound(generate(kernel_vadd), launch_bounds)
+        assert expected in annotated