diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py index d653d18b1448b306d24d9f08c36e256074312f79..5a4852cc92980232020d9cbc8c2a4ec07cf1db3b 100644 --- a/pystencils_benchmark/gpu/benchmark.py +++ b/pystencils_benchmark/gpu/benchmark.py @@ -17,12 +17,19 @@ from pystencils_benchmark.common import (_env, from pystencils_benchmark.enums import Compiler +def _add_launch_bound(code: str, launch_bounds: tuple) -> str: + lb_str = f"__launch_bounds__({','.join(str(lb) for lb in launch_bounds)})" + splitted = code.split("void") + return splitted[0] + lb_str + "".join(splitted[1:]) + + def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], path: Path = None, *, compiler: Compiler = Compiler.NVCC, timing: bool = True, - cuda_block_size: tuple = (32, 1, 1) + cuda_block_size: tuple = (32, 1, 1), + launch_bounds: tuple = None, ) -> None: src_path, include_path = setup_directories(path) @@ -34,10 +41,14 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]], kernel_name = kernel_ast.function_name header = kernel_header(kernel_ast) + if launch_bounds: + header = _add_launch_bound(header, launch_bounds) with open(include_path / f'{kernel_name}.h', 'w+') as f: f.write(header) source = kernel_source(kernel_ast) + if launch_bounds: + source = _add_launch_bound(source, launch_bounds) # TODO CUDA specific suffix with open(src_path / f'{kernel_name}.cu', 'w+') as f: f.write(source) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 0c42c7907be4c85b99b46f05acd9270598a42bc3..929d4f8a59d2fec219fb948da9c87e8ff9c31c2f 100755 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -41,7 +41,11 @@ def test_generate(compiler, config_kwarg): subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True) -def test_generate_gpu(): +gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)}) + + +@pytest.mark.parametrize('kwargs', gpu_kwargs) +def test_generate_gpu(kwargs): compiler = Compiler.NVCC a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000)) @@ -52,6 +56,6 @@ def test_generate_gpu(): with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir: temp_dir = Path(temp_dir) - pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler) + pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler, **kwargs) # subprocess.run(['make', '-C', f'{temp_dir}'], check=True) # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)