Commit e9eb29b9, authored 6 years ago by Martin Bauer
PACXX benchmark generation

Parent: 649e82b4
Showing 2 changed files with 273 additions and 0 deletions:

    pacxx/benchmark.py              170 additions, 0 deletions
    pacxx/benchmark_template.cpp    103 additions, 0 deletions
pacxx/benchmark.py (new file, mode 100644, +170 −0)
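The module implements a three-stage pipeline: generate_benchmark_code renders the Jinja2 template (second file below) with the kernel body produced by the C backend, pacxx_compile builds the result with pacxx++ (with PATH and LD_LIBRARY_PATH pointing into PAXX_ROOT), and run_paxx_benchmark executes the binary and returns the average time per iteration. lbm_performance_compare then pits those numbers against the native pystencils CPU and GPU backends.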
import os
import subprocess
from time import perf_counter
from tempfile import TemporaryDirectory

from jinja2 import Environment, FileSystemLoader

from pystencils import create_data_handling
from pystencils.backends.cbackend import CBackend, generate_c

script_path = os.path.dirname(os.path.realpath(__file__))
PAXX_ROOT = '/local/bauer/code/pacxx/install'
DEFAULT_PAXX_COMPILE_OPTIONS = ('-Ofast', '-march=native')


def generate_benchmark_code(target_file, kernel_ast, target):
    assert target in ('cpu', 'gpu')
    assert hasattr(kernel_ast, 'indexing'), "AST has to be a CUDA kernel in order to create a PACXX kernel from it"
    backend = CBackend()
    function_body = kernel_ast.body

    f_sizes = {f.shape[-1] for f in kernel_ast.fields_accessed}
    assert len(f_sizes) == 1

    env = Environment(loader=FileSystemLoader(script_path))
    result = env.get_template("benchmark_template.cpp").render(f_size=f_sizes.pop(),
                                                               code=backend(function_body),
                                                               target=target)
    with open(target_file, 'w') as f:
        f.write(result)


def pacxx_compile(source, executable, options=DEFAULT_PAXX_COMPILE_OPTIONS):
    command = ['pacxx++', *options, source, '-o', executable]
    env = os.environ.copy()
    env['PATH'] = "{}:{}".format(env.get('PATH', ''), os.path.join(PAXX_ROOT, 'bin'))
    env['LD_LIBRARY_PATH'] = "{}:{}".format(env.get('LD_LIBRARY_PATH', ''), os.path.join(PAXX_ROOT, 'lib'))
    try:
        subprocess.check_output(command, env=env, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(" ".join(command))
        print(e.output.decode('utf8'))
        raise e


def run_paxx_benchmark(executable, domain_size, iterations):
    assert len(domain_size) == 3
    arguments = [executable, *domain_size, iterations]
    arguments = [str(e) for e in arguments]
    output = subprocess.check_output(arguments)
    return float(output) / iterations


def paxx_benchmark(ast, domain_size, iterations, target='cpu', compile_options=DEFAULT_PAXX_COMPILE_OPTIONS):
    """Generates, compiles and runs the kernel with PACXX.

    Args:
        ast: pystencils AST object (has to be generated for CUDA, even when run on CPU with pacxx)
        domain_size: x, y, z extent of spatial domain
        iterations: number of outer iterations
        target: either 'cpu' or 'gpu' to specify where pacxx should run the kernel
        compile_options: compile options for pacxx

    Returns:
        seconds for one outer iteration
    """
    with TemporaryDirectory() as base_dir:
        code = os.path.join(base_dir, 'code.cpp')
        executable = os.path.join(base_dir, 'bench')
        generate_benchmark_code(code, ast, target)
        pacxx_compile(code, executable, compile_options)
        time_per_iteration = run_paxx_benchmark(executable, domain_size, iterations)
    return time_per_iteration


def lbm_performance_compare(domain_size, iterations, **lb_params):
    """Runs benchmark with pacxx and with the normal pystencils backends.

    Args:
        domain_size: 3-tuple with size of spatial domain
        iterations: number of outer iterations
        **lb_params: parameters passed to lbmpy to choose lattice Boltzmann algorithm & optimization options

    Returns:
        dictionary with measurements of time per iteration for different backends
    """
    import pycuda.driver as drv
    from lbmpy.creationfunctions import create_lb_ast

    if 'optimization' not in lb_params:
        lb_params['optimization'] = {}
    lb_params['optimization']['target'] = 'cpu'
    cpu_ast = create_lb_ast(**lb_params)
    lb_params['optimization']['target'] = 'gpu'
    gpu_ast = create_lb_ast(**lb_params)

    # print kernel code of CPU and GPU version - just for comparison, files are not used
    with open("pystencils_cpu_code.c", 'w') as f:
        print(generate_c(cpu_ast), file=f)
    with open("pystencils_gpu_code.cu", 'w') as f:
        print(generate_c(gpu_ast), file=f)

    cpu_kernel = cpu_ast.compile()
    gpu_kernel = gpu_ast.compile()

    f_sizes = {f.shape[-1] for f in cpu_ast.fields_accessed}
    assert len(f_sizes) == 1
    f_size = f_sizes.pop()

    dh = create_data_handling(domain_size, default_target='gpu', default_layout='fzyx')
    dh.add_array('src', values_per_cell=f_size)
    dh.add_array('dst', values_per_cell=f_size)
    dh.fill('src', 0)
    dh.fill('dst', 0)

    # to keep it simple we run the outer loop directly from Python
    # make the domain size large enough, otherwise we measure the Python call overhead
    def run_benchmark(kernel):
        dh.all_to_gpu()
        for i in range(10):  # warmup
            dh.run_kernel(kernel)
        drv.Context.synchronize()

        start = perf_counter()
        for i in range(iterations):
            dh.run_kernel(kernel)
        drv.Context.synchronize()
        return (perf_counter() - start) / iterations

    return {
        'pystencils_cpu': run_benchmark(cpu_kernel),
        'pystencils_gpu': run_benchmark(gpu_kernel),
        'pacxx_cpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='cpu'),
        'pacxx_gpu': paxx_benchmark(gpu_ast, domain_size, iterations, target='gpu'),
    }


if __name__ == '__main__':
    no_opt = {
        'openmp': 8,  # number of threads - pacxx also uses HT cores
        'split': False,
        'vectorization': False,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
    }
    only_vectorization = {
        'openmp': 4,
        'split': False,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
        'vectorization': {'instruction_set': 'avx',
                          'assume_inner_stride_one': True,
                          'nontemporal': False},
    }
    best = {
        'openmp': 4,
        'split': True,
        'gpu_indexing_params': {'block_size': (64, 8, 1)},
        'vectorization': {'instruction_set': 'avx',
                          'assume_inner_stride_one': True,
                          'nontemporal': True},
    }
    res = lbm_performance_compare(stencil='D3Q19', relaxation_rate=1.8, compressible=False,
                                  domain_size=(512, 128, 32), iterations=500,
                                  optimization=only_vectorization)
    cpu_speedup = ((res['pacxx_cpu'] / res['pystencils_cpu']) - 1) * 100
    gpu_speedup = ((res['pacxx_gpu'] / res['pystencils_gpu']) - 1) * 100
    print("Time for one kernel call [s]")
    for config_name, time in res.items():
        print("{0: <16}: {1}".format(config_name, time))
    print("CPU {:.02f}% GPU {:.02f}%".format(cpu_speedup, gpu_speedup))
pacxx/benchmark_template.cpp (new file, mode 100644, +103 −0)
#include <PACXX.h>

#include <vector>
#include <sstream>
#include <iostream>
#include <chrono>

using namespace pacxx::v2;

size_t division_round_up(size_t a, size_t b)
{
    if (a % b == 0)
        return a / b;
    else
        return (a / b) + 1;
}

int main(int argc, char **argv)
{
{% if target == 'cpu' %}
    Executor::Create<NativeRuntime>(0);
{% elif target == 'gpu' %}
    Executor::Create<CUDARuntime>(0);
{% endif %}

    if (argc != 5) {
        std::cout << "Usage: ./benchmark xSize ySize zSize iterations" << std::endl;
        return 1;
    }

    Dimension3 domainSize;
    int64_t iterations;

    auto &exec = Executor::get(0);

    std::stringstream(argv[1]) >> domainSize.x;
    std::stringstream(argv[2]) >> domainSize.y;
    std::stringstream(argv[3]) >> domainSize.z;
    std::stringstream(argv[4]) >> iterations;

    // add ghost layers to be comparable to pystencils native backend
    domainSize.x += 2;
    domainSize.y += 2;
    domainSize.z += 2;

    int64_t totalSize = domainSize.x * domainSize.y * domainSize.z * {{f_size}};
    std::vector<double> src(totalSize, 0.0);
    std::vector<double> dst(totalSize, 0.0);

    auto &dsrc = exec.allocate<double>(src.size());
    auto &ddst = exec.allocate<double>(dst.size());

    dsrc.upload(src.data(), src.size());
    ddst.upload(dst.data(), dst.size());

    double *_data_src = dsrc.get();
    double *_data_dst = ddst.get();

    const int64_t _size_src_0 = domainSize.x;
    const int64_t _size_src_1 = domainSize.y;
    const int64_t _size_src_2 = domainSize.z;

    // fzyx layout
    const int64_t _stride_src_0 = 1;
    const int64_t _stride_src_1 = domainSize.x;
    const int64_t _stride_src_2 = domainSize.x * domainSize.y;
    const int64_t _stride_src_3 = domainSize.x * domainSize.y * domainSize.z;

    auto pacxxKernel = [=](range &config) {
        struct Vec3D { int x; int y; int z; };
        const Vec3D blockDim = {config.get_block_size(0), config.get_block_size(1), config.get_block_size(2)};
        const Vec3D blockIdx = {config.get_block(0), config.get_block(1), config.get_block(2)};
        const Vec3D threadIdx = {config.get_local(0), config.get_local(1), config.get_local(2)};
        {{code | indent(8)}}
    };

    size_t blockSize[] = {64, 8, 1};
    KernelConfiguration config({division_round_up(domainSize.x - 2, blockSize[0]),
                                division_round_up(domainSize.y - 2, blockSize[1]),
                                division_round_up(domainSize.z - 2, blockSize[2])},
                               {blockSize[0], blockSize[1], blockSize[2]});

    // warm up
    for (int64_t i = 0; i < 10; ++i) {
        exec.launch(pacxxKernel, config);
    }
    exec.synchronize();

    auto start = std::chrono::high_resolution_clock::now();
    for (int64_t i = 0; i < iterations; ++i) {
        exec.launch(pacxxKernel, config);
    }
    exec.synchronize();

    auto duration = std::chrono::high_resolution_clock::now() - start;
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(duration);
    std::cout << ns.count() * 1e-9 << std::endl;
}
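The placeholders target, f_size and code are exactly the three variables that generate_benchmark_code passes to render. A standalone rendering sketch follows (an illustration, not part of the commit; the stub string stands in for the C backend output that normally fills code):

from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('pacxx'))
print(env.get_template('benchmark_template.cpp').render(
    f_size=19,                         # values per cell, e.g. a D3Q19 PDF field
    code='// kernel body goes here',   # stub for backend(function_body)
    target='cpu',                      # selects Executor::Create<NativeRuntime>(0)
))

The generated executable is invoked as ./benchmark xSize ySize zSize iterations and prints the total runtime of the timed loop in seconds, which run_paxx_benchmark then divides by the iteration count.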