diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py
index 9a012d6c2a75c98faae05e6815dd3883c7d4d2e4..1938fc80528fc14531d2c7baf20e182439857039 100644
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
@@ -10,8 +10,10 @@ from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
 from pystencils.data_types import get_base_type
 from pystencils.include import get_pystencils_include_path
+from pystencils.integer_functions import modulo_ceil
 from pystencils.sympyextensions import prod
 
+import numpy as np
 
 def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
     """Return C code of a benchmark program for the given kernel.
@@ -37,7 +39,18 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
             assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
             field = accessed_fields[p.field_name]
             dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
+            np_dtype = np.dtype(dtype)
+            # NOTE(review): np.dtype("float") is float64 -- confirm dtype never names a 4-byte C float
+            size_data_type = np_dtype.itemsize  # bytes per element
+            elements = prod(field.shape)
+            align = 64  # bytes; the benchmark template also hard-codes 64 in aligned_alloc
+            required_size = size_data_type * elements + align  # payload plus `align` spare bytes
+            size = modulo_ceil(required_size, align)  # presumably rounds up to a multiple of align -- confirm
+            # offset is counted in elements: the template adds it to the allocated pointer
+            assert align % np_dtype.itemsize == 0
+            offset = int(-ast.ghost_layers[0][0] % (align / np_dtype.itemsize))
+            # tuple order must match the template loops: field_name, dataType, elements, size, offset
+            fields.append((p.field_name, dtype, elements, size, offset))
             call_parameters.append(p.field_name)
 
     header_list = get_headers(ast)
@@ -99,10 +112,10 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
     compiler_config = get_compiler_config()
     compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
     compile_cmd += [*extra_flags,
-                    kerncraft_path / 'headers' / 'timing.c',
-                    kerncraft_path / 'headers' / 'dummy.c',
-                    path / 'bench.c',
-                    '-o', path / 'bench',
+                    str(kerncraft_path / 'headers' / 'timing.c'),  # each path is converted with str() before
+                    str(kerncraft_path / 'headers' / 'dummy.c'),   # the list is handed to run_compile_step
+                    str(path / 'bench.c'),
+                    '-o', str(path / 'bench'),
                     ]
     run_compile_step(compile_cmd)
 
diff --git a/pystencils/kerncraft_coupling/templates/benchmark.c b/pystencils/kerncraft_coupling/templates/benchmark.c
index ae70ddd6775a45c0709e95d57cef061da2a4b6b0..0539b501ccd554329db48550473f24d1bb555353 100644
--- a/pystencils/kerncraft_coupling/templates/benchmark.c
+++ b/pystencils/kerncraft_coupling/templates/benchmark.c
@@ -28,11 +28,11 @@ int main(int argc, char **argv)
   likwid_markerInit();
   {%- endif %}
 
-  {%- for field_name, dataType, size in fields %}
+  {%- for field_name, dataType, elements, size, offset in fields %}
 
   // Initialization {{field_name}}
-  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
-  for (unsigned long long i = 0; i < {{size}}; ++i)
+  {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_alloc(64, {{size}}) + {{offset}};  // offset is in elements; undone before free()
+  for (unsigned long long i = 0; i < {{elements}}; ++i)
     {{field_name}}[i] = 0.23;
 
   if(var_false)
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
       {{kernelName}}({{call_argument_list}});
 
       // Dummy calls
-      {%- for field_name, dataType, size in fields %}
+      {%- for field_name, dataType, elements, size, offset in fields %}
       if(var_false) dummy((void*){{field_name}});
       {%- endfor %}
       {%- for constantName, dataType in constants %}
@@ -105,4 +105,8 @@ int main(int argc, char **argv)
   {%- if likwid %}
   likwid_markerClose();
   {%- endif %}
+
+  {%- for field_name, dataType, elements, size, offset in fields %}
+  free({{field_name}} - {{offset}});  // step back by the offset added at allocation to recover the aligned_alloc pointer
+  {%- endfor %}
 }