diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 8142abebe0da8587b7b00a59d02d9396af1a0c04..86d8f5609267f3a1b173c36a327e36799e581cdc 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,3 +1,3 @@
 from .enums import Compiler
-from .benchmark import generate_benchmark, kernel_header, kernel_source
-from .benchmark_gpu import generate_benchmark_gpu
+from . import gpu
+from . import cpu
diff --git a/pystencils_benchmark/cpu/__init__.py b/pystencils_benchmark/cpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfd889e3fa0ae6a4c9ce20dd81228868cf3e5c59
--- /dev/null
+++ b/pystencils_benchmark/cpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/cpu/benchmark.py
similarity index 100%
rename from pystencils_benchmark/benchmark.py
rename to pystencils_benchmark/cpu/benchmark.py
diff --git a/pystencils_benchmark/gpu/__init__.py b/pystencils_benchmark/gpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfd889e3fa0ae6a4c9ce20dd81228868cf3e5c59
--- /dev/null
+++ b/pystencils_benchmark/gpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
diff --git a/pystencils_benchmark/benchmark_gpu.py b/pystencils_benchmark/gpu/benchmark.py
similarity index 92%
rename from pystencils_benchmark/benchmark_gpu.py
rename to pystencils_benchmark/gpu/benchmark.py
index d0ccbe14dbb512c689fad9dbcee41bae9b260bd8..d653d18b1448b306d24d9f08c36e256074312f79 100644
--- a/pystencils_benchmark/benchmark_gpu.py
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -17,13 +17,13 @@ from pystencils_benchmark.common import (_env,
 from pystencils_benchmark.enums import Compiler
 
 
-def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
-                           path: Path = None,
-                           *,
-                           compiler: Compiler = Compiler.NVCC,
-                           timing: bool = True,
-                           cuda_block_size: tuple = (32, 1, 1)
-                           ) -> None:
+def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                       path: Path = None,
+                       *,
+                       compiler: Compiler = Compiler.NVCC,
+                       timing: bool = True,
+                       cuda_block_size: tuple = (32, 1, 1)
+                       ) -> None:
 
     src_path, include_path = setup_directories(path)
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 48207dad60960914081a5324e8873dceb76f4cc4..0c42c7907be4c85b99b46f05acd9270598a42bc3 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,9 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
+
+from pystencils_benchmark import Compiler
+import pystencils_benchmark as pb
 
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -34,7 +36,7 @@ def test_generate(compiler, config_kwarg):
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
+        pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
 
@@ -50,6 +52,6 @@ def test_generate_gpu():
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark_gpu(kernel_vadd, temp_dir, compiler=compiler)
+        pb.gpu.generate_benchmark(kernel_vadd, temp_dir, compiler=compiler)
         # subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         # subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
diff --git a/ve_example/test.py b/ve_example/test.py
index ee5e32bf78fdde057943c13c9b523db671ce7b3d..9bb91d6abbab126276a02be8e5e739a640c86052 100755
--- a/ve_example/test.py
+++ b/ve_example/test.py
@@ -4,15 +4,16 @@ import subprocess
 import numpy as np
 import sympy as sp
 import pystencils as ps
-from pystencils_benchmark import generate_benchmark, Compiler
+import pystencils_benchmark as pb
 from pathlib import Path
 
 
-def generate(path: Path, compiler: Compiler):
+def generate(path: Path, compiler: pb.Compiler):
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
     alpha = sp.symbols('alpha')
 
     kernels = []
+
     @ps.kernel_config(ps.CreateKernelConfig())
     def vadd():
         a[0] @= b[0] + c[0]
@@ -33,20 +34,20 @@ def generate(path: Path, compiler: Compiler):
         b[0] @= alpha * a[0] + b[0]
     kernels.append(ps.create_kernel(**daxpy_vector))
 
-    generate_benchmark(kernels, path, compiler=compiler)
+    pb.cpu.generate_benchmark(kernels, path, compiler=compiler)
 
 
 def make(path: Path):
-    subprocess.run(['make'], check=True)
+    subprocess.run(['make', '-C', f'{path}'], check=True)  # build inside `path`, not the cwd
 
 
-def execute(path: Path, compiler: Compiler):
-    subprocess.run([f'./benchmark-{compiler.name}', '100'], check=True)
+def execute(path: Path, compiler: pb.Compiler):
+    subprocess.run([f'{path}/benchmark-{compiler.name}', '100'], check=True)  # binary lives under `path`
 
 
 def main():
-    compiler = Compiler.GCCdebug
-    path = Path.cwd()
+    compiler = pb.Compiler.GCCdebug
+    path = Path.cwd() / 'generated'  # directory handed to generate(), make() and execute() below
     generate(path, compiler)
     make(path)
     execute(path, compiler)
@@ -54,4 +55,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-