Commit 92c29b18 authored by Markus Holzer

Added CUDA benchmarks

parent e2f42187
1 merge request: !1 Add CUDA support
Pipeline #37789 skipped
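
For context, the new generate_benchmark_gpu entry point is called the same way as the existing CPU generator. The following is a minimal sketch that mirrors the test added at the end of this diff; the output directory name 'benchmark_gpu' is illustrative.

import numpy as np
import pystencils as ps
from pathlib import Path

from pystencils_benchmark import generate_benchmark_gpu, Compiler

# Three 1D fields for a simple vector add, as in the new GPU test
a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))

@ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
def vadd():
    a[0] @= b[0] + c[0]

kernel_vadd = ps.create_kernel(**vadd)

# Writes include/, src/, Makefile and NVCC.mk into the target directory
generate_benchmark_gpu([kernel_vadd], Path('benchmark_gpu'), compiler=Compiler.NVCC)
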
from .enums import Compiler
from .benchmark import generate_benchmark, kernel_header, kernel_source
+from .benchmark_gpu import generate_benchmark_gpu
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
import numpy as np

from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction, PragmaBlock
+from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
@@ -147,7 +147,7 @@ def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True):
        'timing': timing,
    }
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
    return main
@@ -160,7 +160,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
        'function_signature': function_signature,
    }
-    header = _env.get_template('kernel.h').render(**jinja_context)
+    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
    return header
@@ -176,5 +176,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> s
        'timing': True,
    }
-    source = _env.get_template('kernel.c').render(**jinja_context)
+    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
    return source
from typing import Union, List
from collections import namedtuple
from pathlib import Path

from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_shape
from pystencils.gpucuda import BlockIndexing

from pystencils_benchmark.enums import Compiler

_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
                   trim_blocks=True, lstrip_blocks=True)
def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                           path: Path = None,
                           *,
                           compiler: Compiler = Compiler.GCC) -> None:
    if path is None:
        path = Path('.')
    else:
        path.mkdir(parents=True, exist_ok=True)
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)

    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        header = kernel_header(kernel_ast)
        with open(include_path / f'{kernel_name}.h', 'w+') as f:
            f.write(header)

        source = kernel_source(kernel_ast)
        with open(src_path / f'{kernel_name}.c', 'w+') as f:
            f.write(source)

    with open(src_path / 'main.c', 'w+') as f:
        f.write(kernel_main(kernel_asts))

    copy_static_files(path)
    compiler_toolchain(path, compiler)
def compiler_toolchain(path: Path, compiler: Compiler) -> None:
    name = compiler.name
    jinja_context = {
        'compiler': name,
    }

    files = ['Makefile', f'{name}.mk']
    for file_name in files:
        with open(path / file_name, 'w+') as f:
            template = _env.get_template(file_name).render(**jinja_context)
            f.write(template)
def copy_static_files(path: Path) -> None:
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)

    files = ['timing.h', 'timing.c']
    for file_name in files:
        template = _env.get_template(file_name).render()
        if file_name[-1] == 'h':
            target_path = include_path / file_name
        elif file_name[-1] == 'c':
            target_path = src_path / file_name
        else:
            target_path = path / file_name
        with open(target_path, 'w+') as f:
            f.write(template)
def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
    """
    Return the C code of a benchmark program for the given kernels.

    Args:
        kernels_ast: list of pystencils AST objects, as returned by create_kernel, to benchmark
        timing: add timing output to the code, prints time per iteration to stdout
        cuda_block_size: defines the CUDA block size
    Returns:
        C code as string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        block_and_thread_numbers = dict()
        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)

                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        common_shape = get_common_shape(kernel.fields_accessed)
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }
    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main
def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
    header_guard = f'_{kernel_ast.function_name.upper()}_H'

    jinja_context = {
        'header_guard': header_guard,
        'function_signature': function_signature,
        'target': 'gpu'
    }
    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
    return header
def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    kernel_name = kernel_ast.function_name
    function_source = generate_c(kernel_ast, dialect=dialect)
    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
    headers.update(get_headers(kernel_ast))

    jinja_context = {
        'function_source': function_source,
        'headers': sorted(headers),
        'timing': True,
        'target': 'gpu'
    }
    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
    return source
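
For orientation, the 'block' and 'grid' tuples that indexing.call_parameters(common_shape) returns above are exactly what the generated main.c feeds into the <<<grid, blocks>>> launch. Below is a minimal, illustrative sketch of that mapping, assuming a fixed block size and plain ceil-division per dimension (pystencils' BlockIndexing may additionally adapt the block size to the iteration space); block_and_grid is a hypothetical helper, not part of pystencils.

from math import ceil

def block_and_grid(shape, block=(32, 1, 1)):
    # Hypothetical helper: pad the shape to the three CUDA launch dimensions
    # and take the ceiling division per dimension to obtain the grid size.
    shape3 = tuple(shape) + (1,) * (3 - len(shape))
    grid = tuple(ceil(s / b) for s, b in zip(shape3, block))
    return {'block': block, 'grid': grid}

# A 1D field with 4,000,000 elements (as in the GPU test below) gives
# {'block': (32, 1, 1), 'grid': (125000, 1, 1)}
print(block_and_grid((4_000_000,)))
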
@@ -6,3 +6,4 @@ class Compiler(Enum):
    GCCdebug = auto()
    Clang = auto()
    ICC = auto()
+    NVCC = auto()
CC = nvcc
LINKER = $(CC)

# More warnings, please
#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
# Maybe too many warnings
#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code

# Specific C flags
CFLAGS := -use_fast_math

DEFINES = -D_GNU_SOURCE -DNDEBUG
INCLUDES =
LIBS =
{% for header in headers %}
#include {{header}}
{% endfor %}
#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
{{function_source}}
#ifndef {{header_guard}}
#define {{header_guard}}
#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
{{function_signature}};
#endif
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "timing.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
#include <cuda_runtime.h>
//kernels
{% for include in includes %}
#include "{{ include }}.h"
{% endfor %}
int main(int argc, char **argv)
{
    if (argc < 2) {
        printf("Usage: %s <n_repeat>\n", argv[0]);
        return -1;
    }
    int n_repeat = atoi(argv[1]);

    {% for kernel in kernels %}
    { // Kernel: {{kernel.name}}
        {% for field_name, dataType, elements in kernel.fields %}
        {{dataType}} *{{field_name}};
        cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
        // cudaMemset writes bytes, so zero-initialise the full buffer (a value like 0.23 cannot be set this way)
        cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));
        {% endfor %}

        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});

        for (int warmup = 1; warmup >= 0; --warmup) {
            int repeat = 2;
            if (warmup == 0) {
                repeat = n_repeat;
            }

            {% if timing %}
            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
            timing(&wcStartTime, &cpuStartTime);
            {% endif %}

            for (; repeat > 0; --repeat)
            {
                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
            }

            {% if timing %}
            // kernel launches are asynchronous: synchronise before stopping the timer
            cudaDeviceSynchronize();
            timing(&wcEndTime, &cpuEndTime);
            if (warmup == 0)
                printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat);
            {% endif %}
        }

        {% for field_name, dataType, elements in kernel.fields %}
        cudaFree({{field_name}});
        {% endfor %}
    }
    {% endfor %}
}
@@ -6,7 +6,7 @@ import tempfile
import pytest
import pystencils as ps
from pathlib import Path

-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu

compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg):
    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+def test_generate_gpu():
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    temp_dir = Path('/home/markus/pystencils_benchmark_testfolder')
+    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
+    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)