Compare revisions

Commits on Source (2)
Showing 416 additions and 77 deletions
.gitlab-ci.yml
@@ -26,6 +26,7 @@ test:
   image: i10git.cs.fau.de:5005/pycodegen/pystencils-benchmark/pystencils-benchmark
   tags:
     - docker
+    - cuda
   script:
     - pip install tox
     - echo $TOX_ENV
Dockerfile
-FROM ubuntu:latest
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 LABEL maintainer="jan.hoenig@fau.de"
-LABEL version="0.1"
+LABEL version="0.2"
 LABEL description="Custom docker image for pystencils-benchmark"
 ARG DEBIAN_FRONTEND=noninteractive
MANIFEST.in
 include README.md
 include LICENSE.md
 include pystencils_benchmark/templates/*
+include pystencils_benchmark/templates/cpu/*
+include pystencils_benchmark/templates/gpu/*
pystencils_benchmark/__init__.py
 from .enums import Compiler
-from .benchmark import generate_benchmark, kernel_header, kernel_source
+from . import gpu
+from . import cpu
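
The user-visible effect of this split: generate_benchmark is no longer exported from the package root, and callers go through the per-target submodules instead. A minimal before/after sketch (consistent with the updated test at the end of this diff):

import pystencils_benchmark as pb

# before this change:
#   from pystencils_benchmark import generate_benchmark
# after this change:
pb.cpu.generate_benchmark  # the former package-level (CPU) generator
pb.gpu.generate_benchmark  # the new CUDA generator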
pystencils_benchmark/common.py (new file)
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend

from jinja2 import Environment, PackageLoader, StrictUndefined

from pystencils_benchmark.enums import Compiler

from pathlib import Path

_env = Environment(loader=PackageLoader('pystencils_benchmark'),
                   undefined=StrictUndefined,
                   keep_trailing_newline=True,
                   trim_blocks=True, lstrip_blocks=True)


def _kernel_header(kernel_ast: KernelFunction,
                   dialect: Backend = Backend.C,
                   *,
                   template_file: str,
                   additional_jinja_context: dict = None) -> str:
    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
    header_guard = f'_{kernel_ast.function_name.upper()}_H'
    jinja_context = {
        'header_guard': header_guard,
        'function_signature': function_signature,
    }
    if additional_jinja_context is not None:
        jinja_context.update(additional_jinja_context)
    header = _env.get_template(template_file).render(**jinja_context)
    return header


def _kernel_source(kernel_ast: KernelFunction,
                   dialect: Backend = Backend.C,
                   *,
                   template_file: str,
                   additional_jinja_context: dict = None) -> str:
    kernel_name = kernel_ast.function_name
    function_source = generate_c(kernel_ast, dialect=dialect)
    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
    headers.update(get_headers(kernel_ast))
    jinja_context = {
        'function_source': function_source,
        'headers': sorted(headers),
        'timing': True,
    }
    if additional_jinja_context is not None:
        jinja_context.update(additional_jinja_context)
    source = _env.get_template(template_file).render(**jinja_context)
    return source


def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
    name = compiler.name
    jinja_context = {
        'compiler': name,
        'likwid': likwid,
    }
    files = ['Makefile', f'{name}.mk']
    for file_name in files:
        with open(path / file_name, 'w+') as f:
            template = _env.get_template(file_name).render(**jinja_context)
            f.write(template)


def copy_static_files(path: Path, *, source_file_suffix='.c') -> None:
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)
    files = ['timing.h', 'timing.c']
    for file_name in files:
        template = _env.get_template(file_name).render()
        if file_name[-1] == 'h':
            target_path = include_path / file_name
        elif file_name[-1] == 'c':
            target_path = src_path / file_name
            target_path = target_path.with_suffix(source_file_suffix)
        else:
            target_path = path / file_name
        with open(target_path, 'w+') as f:
            f.write(template)


def setup_directories(path: Path):
    if path is None:
        path = Path('.')
    else:
        path.mkdir(parents=True, exist_ok=True)
    src_path = path / 'src'
    src_path.mkdir(parents=True, exist_ok=True)
    include_path = path / 'include'
    include_path.mkdir(parents=True, exist_ok=True)
    return src_path, include_path
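
A minimal usage sketch of the helpers above (the path is illustrative): setup_directories creates <path>/src and <path>/include and returns both, and copy_static_files can retarget the shared timing source for CUDA builds via source_file_suffix.

from pathlib import Path
from pystencils_benchmark.common import copy_static_files, setup_directories

src, inc = setup_directories(Path('bench'))                 # creates bench/src and bench/include
copy_static_files(Path('bench'), source_file_suffix='.cu')  # writes bench/include/timing.h and bench/src/timing.cu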
pystencils_benchmark/cpu/__init__.py (new file)
from .benchmark import generate_benchmark
pystencils_benchmark/cpu/benchmark.py (moved from pystencils_benchmark/benchmark.py)
 from typing import Union, List
 from collections import namedtuple
 from pathlib import Path

-from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np

 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
 from pystencils.integer_functions import modulo_ceil

+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
 from pystencils_benchmark.enums import Compiler

-_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
-                   trim_blocks=True, lstrip_blocks=True)


 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
@@ -26,14 +26,8 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        timing: bool = True,
                        likwid: bool = False
                        ) -> None:
-    if path is None:
-        path = Path('.')
-    else:
-        path.mkdir(parents=True, exist_ok=True)
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
+    src_path, include_path = setup_directories(path)

     if isinstance(kernel_asts, KernelFunction):
         kernel_asts = [kernel_asts]
@@ -56,39 +50,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],

     compiler_toolchain(path, compiler, likwid)


-def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
-    name = compiler.name
-    jinja_context = {
-        'compiler': name,
-        'likwid': likwid,
-    }
-    files = ['Makefile', f'{name}.mk']
-    for file_name in files:
-        with open(path / file_name, 'w+') as f:
-            template = _env.get_template(file_name).render(**jinja_context)
-            f.write(template)
-
-
-def copy_static_files(path: Path) -> None:
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
-    files = ['timing.h', 'timing.c']
-    for file_name in files:
-        template = _env.get_template(file_name).render()
-        if file_name[-1] == 'h':
-            target_path = include_path / file_name
-        elif file_name[-1] == 'c':
-            target_path = src_path / file_name
-        else:
-            target_path = path / file_name
-        with open(target_path, 'w+') as f:
-            f.write(template)


 def kernel_main(kernels_ast: List[KernelFunction], *,
                 timing: bool = True, likwid: bool = False) -> str:
     """
@@ -159,34 +120,13 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main


 def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
-    header_guard = f'_{kernel_ast.function_name.upper()}_H'
-    jinja_context = {
-        'header_guard': header_guard,
-        'function_signature': function_signature,
-    }
-    header = _env.get_template('kernel.h').render(**jinja_context)
-    return header
+    return _kernel_header(kernel_ast, dialect=dialect, template_file='cpu/kernel.h')


 def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    kernel_name = kernel_ast.function_name
-    function_source = generate_c(kernel_ast, dialect=dialect)
-    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
-    headers.update(get_headers(kernel_ast))
-    jinja_context = {
-        'function_source': function_source,
-        'headers': sorted(headers),
-        'timing': True,
-    }
-    source = _env.get_template('kernel.c').render(**jinja_context)
-    return source
+    return _kernel_source(kernel_ast, dialect=dialect, template_file='cpu/kernel.c')
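
For reference, an end-to-end sketch of the relocated CPU entry point (mirrors the updated test at the bottom of this diff; field size and path are illustrative):

import numpy as np
import pystencils as ps
import pystencils_benchmark as pb
from pathlib import Path

a, b = ps.fields(a=np.ones(1000), b=np.ones(1000))

@ps.kernel_config(ps.CreateKernelConfig())
def vcopy():
    a[0] @= b[0]

pb.cpu.generate_benchmark(ps.create_kernel(**vcopy), Path('bench-cpu'), compiler=pb.Compiler.GCC)
# then: make -C bench-cpu && ./bench-cpu/benchmark-GCC 10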
pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
pystencils_benchmark/gpu/__init__.py (new file)
from .benchmark import generate_benchmark
pystencils_benchmark/gpu/benchmark.py (new file)
from typing import Union, List
from collections import namedtuple
from pathlib import Path

from pystencils.astnodes import KernelFunction
from pystencils.enums import Backend
from pystencils.typing import get_base_type
from pystencils.sympyextensions import prod
from pystencils.transformations import get_common_field

from pystencils_benchmark.common import (_env,
                                         _kernel_source,
                                         _kernel_header,
                                         compiler_toolchain,
                                         copy_static_files,
                                         setup_directories)
from pystencils_benchmark.enums import Compiler


def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
    splitted = code.split("void ")
    prefix = splitted[0]
    if code.startswith("void "):
        # in case there is nothing before the first "void"
        prefix = ""
    return prefix + "void " + lb_str + "void ".join(splitted[1:])
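
# Illustrative trace of the helper above (input string is hypothetical): only the
# first "void" gains the qualifier; later occurrences are rejoined untouched.
#
#   _add_launch_bound('FUNC_PREFIX void vadd(double * RESTRICT a)', (256, 2))
#   -> 'FUNC_PREFIX void __launch_bounds__(256, 2) vadd(double * RESTRICT a)'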
def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                       path: Path = None,
                       *,
                       compiler: Compiler = Compiler.NVCC,
                       timing: bool = True,
                       launch_bounds: tuple = None,
                       ) -> None:
    src_path, include_path = setup_directories(path)

    if isinstance(kernel_asts, KernelFunction):
        kernel_asts = [kernel_asts]

    for kernel_ast in kernel_asts:
        kernel_name = kernel_ast.function_name

        header = kernel_header(kernel_ast)
        if launch_bounds:
            header = _add_launch_bound(header, launch_bounds)
        with open(include_path / f'{kernel_name}.h', 'w+') as f:
            f.write(header)

        source = kernel_source(kernel_ast)
        if launch_bounds:
            source = _add_launch_bound(source, launch_bounds)
        # TODO CUDA specific suffix
        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
            f.write(source)

    with open(src_path / 'main.cu', 'w+') as f:
        f.write(kernel_main(kernel_asts, timing=timing))

    copy_static_files(path, source_file_suffix='.cu')
    compiler_toolchain(path, compiler, likwid=False)
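
# End-to-end sketch (mirrors test_generate_gpu at the bottom of this diff;
# path and bounds are illustrative):
#
#   ast = ps.create_kernel(...)   # built with target=ps.Target.GPU
#   generate_benchmark(ast, Path('bench-gpu'), launch_bounds=(256,))
#   # then: make -C bench-gpu && ./bench-gpu/benchmark-NVCC 10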
def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True) -> str:
    """
    Return CUDA source for a benchmark program that runs the given kernels.

    Args:
        kernels_ast: list of pystencils AST objects, as returned by create_kernel, to benchmark
        timing: add timing output to the code, prints time per iteration to stdout

    Returns:
        CUDA code as string
    """
    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
                                   'call_argument_list', 'blocks', 'grid'])
    kernels = []
    includes = set()
    for kernel in kernels_ast:
        name = kernel.function_name
        accessed_fields = {f.name: f for f in kernel.fields_accessed}
        constants = []
        fields = []
        call_parameters = []
        block_and_thread_numbers = dict()

        for p in kernel.get_parameters():
            if not p.is_field_parameter:
                constants.append((p.symbol.name, str(p.symbol.dtype)))
                call_parameters.append(p.symbol.name)
            else:
                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
                field = accessed_fields[p.field_name]
                dtype = str(get_base_type(p.symbol.dtype))
                elements = prod(field.shape)

                fields.append((p.field_name, dtype, elements))
                call_parameters.append(p.field_name)

        common_shape = get_common_field(kernel.fields_accessed).shape
        indexing = kernel.indexing
        block_and_thread_numbers = indexing.call_parameters(common_shape)
        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])

        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
                              call_argument_list=",".join(call_parameters),
                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
        includes.add(name)

    jinja_context = {
        'kernels': kernels,
        'includes': includes,
        'timing': timing,
    }
    main = _env.get_template('gpu/main.c').render(**jinja_context)
    return main


def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    return _kernel_header(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.h',
                          additional_jinja_context={'target': 'gpu'})


def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
    return _kernel_source(kernel_ast,
                          dialect=dialect,
                          template_file='gpu/kernel.cu',
                          additional_jinja_context={'target': 'gpu'})
pystencils_benchmark/templates/Makefile
@@ -29,6 +29,8 @@ LIBS += -llikwid
 VPATH = $(SRC_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+# TODO CUDA specific SUFFIX
+OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))

 CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES)
@@ -43,6 +45,12 @@ $(BUILD_DIR)/%.o: %.c
 	$(Q)$(CC) -c $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d

+# TODO CUDA specific SUFFIX
+$(BUILD_DIR)/%.o: %.cu
+	@echo "===> COMPILE $@"
+	$(Q)$(CC) -c $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s: %.c
 	@echo "===> GENERATE ASM $@"
 	$(Q)$(CC) -S $(CFLAGS) $< -o $@
pystencils_benchmark/templates/NVCC.mk (new file)
CC = nvcc
LINKER = $(CC)

# More warnings, please
#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
# Maybe too many warnings
#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code

# Specific C flags
CFLAGS := -use_fast_math
DEFINES = -D_GNU_SOURCE -DNDEBUG
INCLUDES =
LIBS =
pystencils_benchmark/templates/gpu/kernel.cu (new file)
{% for header in headers %}
#include {{header}}
{% endfor %}

#define RESTRICT __restrict__
#define FUNC_PREFIX __global__

{{function_source}}
pystencils_benchmark/templates/gpu/kernel.h (new file)
#ifndef {{header_guard}}
#define {{header_guard}}

#define RESTRICT __restrict__
#define FUNC_PREFIX __global__

{{function_signature}};

#endif
pystencils_benchmark/templates/gpu/main.c (new file)
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "timing.h"

#define RESTRICT __restrict__
#define FUNC_PREFIX __global__
#include <cuda_runtime.h>

// kernels
{% for include in includes %}
#include "{{ include }}.h"
{% endfor %}

int main(int argc, char **argv)
{
  if(argc < 2) {
    printf("Usage: %s <n_repeat>\n", argv[0]);
    return -1;
  }
  int n_repeat = atoi(argv[1]);

  {% for kernel in kernels %}
  { // Kernel: {{kernel.name}}
    {% for field_name, dataType, elements in kernel.fields %}
    {{dataType}} *{{field_name}};
    cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
    /* cudaMemset works byte-wise, so the buffer is zero-initialised over its
       full size; a 0.23 fill as in the CPU benchmark would need a host-side
       buffer plus cudaMemcpy. */
    cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));
    {% endfor %}

    {% for constantName, dataType in kernel.constants %}
    // Constant {{constantName}}
    {{dataType}} {{constantName}};
    {{constantName}} = 0.23;
    {% endfor %}

    dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
    dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});

    for(int warmup = 1; warmup >= 0; --warmup) {
      int repeat = 2;
      if(warmup == 0) {
        repeat = n_repeat;
      }

      {% if timing %}
      double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
      timing(&wcStartTime, &cpuStartTime);
      {% endif %}

      for (; repeat > 0; --repeat)
      {
        {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
      }
      {% if timing %}
      /* kernel launches are asynchronous: synchronise before stopping the timer */
      cudaDeviceSynchronize();
      timing(&wcEndTime, &cpuEndTime);
      if( warmup == 0)
        printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat );
      {% endif %}
    }

    {% for field_name, dataType, elements in kernel.fields %}
    cudaFree({{field_name}});
    {% endfor %}
  }
  {% endfor %}
}
setup.cfg
@@ -10,7 +10,10 @@ license = AGPLv3
 version = 0.0.1

 [options]
-packages = pystencils_benchmark
+packages =
+    pystencils_benchmark
+    pystencils_benchmark.gpu
+    pystencils_benchmark.cpu
 install_requires =
     jinja2 >= 3.0
     pystencils >= 0.3.4
tests/test_benchmark.py
@@ -6,7 +6,9 @@ import tempfile
 import pytest

 import pystencils as ps
 from pathlib import Path

-from pystencils_benchmark import generate_benchmark, Compiler
+from pystencils_benchmark import Compiler
+import pystencils_benchmark as pb

 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
@@ -16,6 +18,20 @@ config_kwargs = ({},
                  'assume_aligned': True}})


+def nvidia_gpu_available():
+    try:
+        return subprocess.call(['nvidia-smi']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
+def nvcc_available():
+    try:
+        return subprocess.call(['nvcc', '--version']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
 @pytest.mark.parametrize('compiler', compilers)
 @pytest.mark.parametrize('config_kwarg', config_kwargs)
 def test_generate(compiler, config_kwarg):
@@ -34,8 +50,36 @@ def test_generate(compiler, config_kwarg):

     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
+        pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+    alpha = sp.symbols('alpha')
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def daxpy():
+        b[0] @= alpha * a[0] + b[0]
+    kernel_daxpy = ps.create_kernel(**daxpy)
+
+    with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
+        temp_dir = Path(temp_dir)
+        pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
+        if not nvcc_available():
+            pytest.skip("nvcc is not available!")
+        subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        if not nvidia_gpu_available():
+            pytest.skip("There is no GPU available!")
+        subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)