Commit c51eee43 authored by Markus Holzer, committed by Christoph Alt

Added CUDA benchmarks

parent ae05d616
1 merge request: !1 Add CUDA support
pystencils_benchmark/__init__.py
 from .enums import Compiler
 from .benchmark import generate_benchmark, kernel_header, kernel_source
+from .benchmark_gpu import generate_benchmark_gpu
pystencils_benchmark/benchmark.py
@@ -6,7 +6,7 @@ from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.astnodes import KernelFunction, PragmaBlock
+from pystencils.astnodes import KernelFunction
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
@@ -159,7 +159,7 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
@@ -172,7 +172,7 @@ def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
         'header_guard': header_guard,
         'function_signature': function_signature,
     }
-    header = _env.get_template('kernel.h').render(**jinja_context)
+    header = _env.get_template('cpu/kernel.h').render(**jinja_context)
     return header
@@ -188,5 +188,5 @@ def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
         'timing': True,
     }
-    source = _env.get_template('kernel.c').render(**jinja_context)
+    source = _env.get_template('cpu/kernel.c').render(**jinja_context)
     return source
pystencils_benchmark/benchmark_gpu.py (new file)
from typing import Union, List
from collections import namedtuple
from pathlib import Path

from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_shape
from pystencils.gpucuda import BlockIndexing

from pystencils_benchmark.enums import Compiler

_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
                   trim_blocks=True, lstrip_blocks=True)
def generate_benchmark_gpu(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                           path: Path = None,
                           *,
                           compiler: Compiler = Compiler.GCC) -> None:
    if path is None:
        path = Path('.')
    else:
        path.mkdir(parents=True, exist_ok=True)
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)

    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        header = kernel_header(kernel_ast)
        with open(include_path / f'{kernel_name}.h', 'w+') as f:
            f.write(header)

        source = kernel_source(kernel_ast)
        with open(src_path / f'{kernel_name}.c', 'w+') as f:
            f.write(source)

    with open(src_path / 'main.c', 'w+') as f:
        f.write(kernel_main(kernel_asts))

    copy_static_files(path)
    compiler_toolchain(path, compiler)
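For orientation, a minimal usage sketch of the new entry point (not part of this commit; the field size, output directory, and kernel are illustrative and mirror the test added at the bottom of this diff):

import numpy as np
import pystencils as ps
from pathlib import Path
from pystencils_benchmark import generate_benchmark_gpu, Compiler

# three 1D fields of equal length; the kernel computes a = b + c element-wise
a, b, c = ps.fields(a=np.ones(1024), b=np.ones(1024), c=np.ones(1024))

@ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
def vadd():
    a[0] @= b[0] + c[0]
kernel = ps.create_kernel(**vadd)

# writes include/, src/, Makefile and NVCC.mk into ./gpu_benchmark, ready for `make`
generate_benchmark_gpu([kernel], Path('./gpu_benchmark'), compiler=Compiler.NVCC)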
def compiler_toolchain(path: Path, compiler: Compiler) -> None:
    name = compiler.name
    jinja_context = {
        'compiler': name,
    }

    files = ['Makefile', f'{name}.mk']
    for file_name in files:
        with open(path / file_name, 'w+') as f:
            template = _env.get_template(file_name).render(**jinja_context)
            f.write(template)
def copy_static_files(path: Path) -> None:
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)

    files = ['timing.h', 'timing.c']
    for file_name in files:
        template = _env.get_template(file_name).render()
        if file_name[-1] == 'h':
            target_path = include_path / file_name
        elif file_name[-1] == 'c':
            target_path = src_path / file_name
        else:
            target_path = path / file_name
        with open(target_path, 'w+') as f:
            f.write(template)
def kernel_main(kernels_ast: List[KernelFunction], timing: bool = True, cuda_block_size: tuple = (32, 1, 1)):
    """
    Return C code of a benchmark program for the given kernels.

    Args:
        kernels_ast: list of pystencils AST objects, as returned by create_kernel, to benchmark
        timing: add timing output to the code, prints time per iteration to stdout
        cuda_block_size: defines the cuda block grid
    Returns:
        C code as string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        block_and_thread_numbers = dict()
        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)
                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        common_shape = get_common_shape(kernel.fields_accessed)
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }
    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main
def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
    header_guard = f'_{kernel_ast.function_name.upper()}_H'

    jinja_context = {
        'header_guard': header_guard,
        'function_signature': function_signature,
        'target': 'gpu'
    }
    header = _env.get_template('gpu/kernel.h').render(**jinja_context)
    return header
def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    kernel_name = kernel_ast.function_name
    function_source = generate_c(kernel_ast, dialect=dialect)
    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
    headers.update(get_headers(kernel_ast))

    jinja_context = {
        'function_source': function_source,
        'headers': sorted(headers),
        'timing': True,
        'target': 'gpu'
    }
    source = _env.get_template('gpu/kernel.cu').render(**jinja_context)
    return source
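The header and source generators can also be called on their own, for example to inspect the generated CUDA code without writing out the full benchmark project. A small sketch, reusing the kernel object and Path import from the sketch above (the output file names are illustrative):

from pystencils_benchmark.benchmark_gpu import kernel_header, kernel_source

Path('vadd.h').write_text(kernel_header(kernel))    # rendered from the gpu/kernel.h template
Path('vadd.cu').write_text(kernel_source(kernel))   # rendered from the gpu/kernel.cu template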
pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
Template: NVCC.mk
CC = nvcc
LINKER = $(CC)

# More warnings please
#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
# Maybe too many warnings
#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code

# Specific C flags
CFLAGS := -use_fast_math
DEFINES = -D_GNU_SOURCE -DNDEBUG
INCLUDES =
LIBS =
Template: gpu/kernel.cu
{% for header in headers %}
#include {{header}}
{% endfor %}
#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
{{function_source}}
Template: gpu/kernel.h
#ifndef {{header_guard}}
#define {{header_guard}}
#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
{{function_signature}};
#endif
Template: gpu/main.c
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "timing.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
#include <cuda_runtime.h>
//kernels
{% for include in includes %}
#include "{{ include }}.h"
{% endfor %}
int main(int argc, char **argv)
{
    if(argc < 2) {
        printf("Usage: %s <n_repeat>\n", argv[0]);
        return -1;
    }
    int n_repeat = atoi(argv[1]);

    {% for kernel in kernels %}
    { // Kernel: {{kernel.name}}
        {% for field_name, dataType, elements in kernel.fields %}
        {{dataType}} *{{field_name}};
        cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
        cudaMemset({{field_name}}, 0.23, {{elements}});
        {% endfor %}

        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});

        for(int warmup = 1; warmup >= 0; --warmup) {
            int repeat = 2;
            if(warmup == 0) {
                repeat = n_repeat;
            }
            {% if timing %}
            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
            timing(&wcStartTime, &cpuStartTime);
            {% endif %}

            for (; repeat > 0; --repeat)
            {
                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
            }

            {% if timing %}
            timing(&wcEndTime, &cpuEndTime);
            if( warmup == 0)
                printf("%s\t%e\n", "{{kernel.name}}",(wcEndTime - wcStartTime) / n_repeat );
            {% endif %}
        }

        {% for field_name, dataType, elements in kernel.fields %}
        cudaFree({{field_name}});
        {% endfor %}
    }
    {% endfor %}
}
@@ -6,7 +6,7 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import generate_benchmark, Compiler, generate_benchmark_gpu
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -39,3 +39,18 @@ def test_generate(compiler, config_kwarg):
     subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)

def test_generate_gpu():
    compiler = Compiler.NVCC
    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))

    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
    def vadd():
        a[0] @= b[0] + c[0]
    kernel_vadd = ps.create_kernel(**vadd)

    temp_dir = Path('/home/markus/pystencils_benchmark_testfolder')
    generate_benchmark_gpu([kernel_vadd], temp_dir, compiler=compiler)
    subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
    subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)