From 51353a2bdf95069ce3a15a6de8d27cc208ad8992 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Wed, 18 Sep 2024 13:53:32 +0200
Subject: [PATCH] Add CUDA support

---
 .gitlab-ci.yml                               |   1 +
 Dockerfile                                   |   4 +-
 MANIFEST.in                                  |   2 +
 pystencils_benchmark/__init__.py             |   3 +-
 pystencils_benchmark/common.py               | 102 ++++++++++++++
 pystencils_benchmark/cpu/__init__.py         |   1 +
 pystencils_benchmark/{ => cpu}/benchmark.py  |  82 ++---------
 pystencils_benchmark/enums.py                |   1 +
 pystencils_benchmark/gpu/__init__.py         |   1 +
 pystencils_benchmark/gpu/benchmark.py        | 132 ++++++++++++++++++
 pystencils_benchmark/templates/Makefile      |   8 ++
 pystencils_benchmark/templates/NVCC.mk       |  12 ++
 .../templates/{ => cpu}/kernel.c             |   0
 .../templates/{ => cpu}/kernel.h             |   0
 .../templates/{ => cpu}/main.c               |   0
 pystencils_benchmark/templates/gpu/kernel.cu |   8 ++
 pystencils_benchmark/templates/gpu/kernel.h  |  11 ++
 pystencils_benchmark/templates/gpu/main.c    |  72 ++++++++++
 setup.cfg                                    |   5 +-
 tests/test_benchmark.py                      |  48 ++++++-
 tests/test_launch_bounds.py                  |  19 +++
 ve_example/test.py                           |  14 +-
 22 files changed, 442 insertions(+), 84 deletions(-)
 create mode 100644 pystencils_benchmark/common.py
 create mode 100644 pystencils_benchmark/cpu/__init__.py
 rename pystencils_benchmark/{ => cpu}/benchmark.py (65%)
 create mode 100644 pystencils_benchmark/gpu/__init__.py
 create mode 100644 pystencils_benchmark/gpu/benchmark.py
 create mode 100644 pystencils_benchmark/templates/NVCC.mk
 rename pystencils_benchmark/templates/{ => cpu}/kernel.c (100%)
 rename pystencils_benchmark/templates/{ => cpu}/kernel.h (100%)
 rename pystencils_benchmark/templates/{ => cpu}/main.c (100%)
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.cu
 create mode 100644 pystencils_benchmark/templates/gpu/kernel.h
 create mode 100644 pystencils_benchmark/templates/gpu/main.c
 create mode 100644 tests/test_launch_bounds.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 65e62e3..7a159bd 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,7 @@ test:
   image: i10git.cs.fau.de:5005/pycodegen/pystencils-benchmark/pystencils-benchmark
   tags:
     - docker
+    - cuda
   script:
     - pip install tox
     - echo $TOX_ENV
diff --git a/Dockerfile b/Dockerfile
index 2b8f8a8..dcb5493 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
-FROM ubuntu:latest
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
 LABEL maintainer="jan.hoenig@fau.de"
-LABEL version="0.1"
+LABEL version="0.2"
 LABEL description="Custom docker image for pystencils-benchmark"
 
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/MANIFEST.in b/MANIFEST.in
index ef395db..38e6285 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,5 @@
 include README.md
 include LICENSE.md
 include pystencils_benchmark/templates/*
+include pystencils_benchmark/templates/cpu/*
+include pystencils_benchmark/templates/gpu/*
diff --git a/pystencils_benchmark/__init__.py b/pystencils_benchmark/__init__.py
index 6f5f32c..86d8f56 100644
--- a/pystencils_benchmark/__init__.py
+++ b/pystencils_benchmark/__init__.py
@@ -1,2 +1,3 @@
 from .enums import Compiler
-from .benchmark import generate_benchmark, kernel_header, kernel_source
+from . import gpu
+from . import cpu
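The package now exposes cpu and gpu subpackages instead of the single top-level generate_benchmark. A minimal usage sketch of the new CPU entry point (field size and kernel are illustrative; the pattern mirrors the updated tests further down):

import numpy as np
import pystencils as ps
import pystencils_benchmark as pb

a, b = ps.fields(a=np.ones(1024), b=np.ones(1024))

@ps.kernel_config(ps.CreateKernelConfig())
def copy():
    a[0] @= b[0]

# the old top-level call generate_benchmark(...) becomes:
pb.cpu.generate_benchmark([ps.create_kernel(**copy)], compiler=pb.Compiler.GCC)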
diff --git a/pystencils_benchmark/common.py b/pystencils_benchmark/common.py
new file mode 100644
index 0000000..70cabd6
--- /dev/null
+++ b/pystencils_benchmark/common.py
@@ -0,0 +1,102 @@
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils_benchmark.enums import Compiler
+from pathlib import Path
+
+_env = Environment(loader=PackageLoader('pystencils_benchmark'),
+                   undefined=StrictUndefined,
+                   keep_trailing_newline=True,
+                   trim_blocks=True, lstrip_blocks=True)
+
+
+def _kernel_header(kernel_ast: KernelFunction,
+                   dialect: Backend = Backend.C,
+                   *,
+                   template_file: str,
+                   additional_jinja_context: dict = None) -> str:
+
+    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
+    header_guard = f'_{kernel_ast.function_name.upper()}_H'
+
+    jinja_context = {
+        'header_guard': header_guard,
+        'function_signature': function_signature,
+    }
+    if additional_jinja_context is not None:
+        jinja_context.update(additional_jinja_context)
+
+    header = _env.get_template(template_file).render(**jinja_context)
+    return header
+
+
+def _kernel_source(kernel_ast: KernelFunction,
+                   dialect: Backend = Backend.C,
+                   *,
+                   template_file: str,
+                   additional_jinja_context: dict = None) -> str:
+
+    kernel_name = kernel_ast.function_name
+    function_source = generate_c(kernel_ast, dialect=dialect)
+    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
+    headers.update(get_headers(kernel_ast))
+
+    jinja_context = {
+        'function_source': function_source,
+        'headers': sorted(headers),
+        'timing': True,
+    }
+
+    if additional_jinja_context is not None:
+        jinja_context.update(additional_jinja_context)
+
+    source = _env.get_template(template_file).render(**jinja_context)
+    return source
+
+
+def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
+    name = compiler.name
+    jinja_context = {
+        'compiler': name,
+        'likwid': likwid,
+    }
+
+    files = ['Makefile', f'{name}.mk']
+    for file_name in files:
+        with open(path / file_name, 'w+') as f:
+            template = _env.get_template(file_name).render(**jinja_context)
+            f.write(template)
+
+
+def copy_static_files(path: Path, *, source_file_suffix='.c') -> None:
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+
+    files = ['timing.h', 'timing.c']
+    for file_name in files:
+        template = _env.get_template(file_name).render()
+        if file_name[-1] == 'h':
+            target_path = include_path / file_name
+        elif file_name[-1] == 'c':
+            target_path = src_path / file_name
+            target_path = target_path.with_suffix(source_file_suffix)
+        else:
+            target_path = path / file_name
+        with open(target_path, 'w+') as f:
+            f.write(template)
+
+
+def setup_directories(path: Path):
+    if path is None:
+        path = Path('.')
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+    src_path = path / 'src'
+    src_path.mkdir(parents=True, exist_ok=True)
+    include_path = path / 'include'
+    include_path.mkdir(parents=True, exist_ok=True)
+    return src_path, include_path
diff --git a/pystencils_benchmark/cpu/__init__.py b/pystencils_benchmark/cpu/__init__.py
new file mode 100644
index 0000000..cfd889e
--- /dev/null
+++ b/pystencils_benchmark/cpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
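The two private helpers concentrate the Jinja rendering; the cpu and gpu front ends only pick a template path and optionally extend the render context. A sketch of the call pattern (kernel_ast stands for any pystencils KernelFunction; the gpu wrappers further down do exactly this):

from pystencils_benchmark.common import _kernel_header, _kernel_source

# kernel_ast: a pystencils KernelFunction, e.g. from ps.create_kernel(...)
header = _kernel_header(kernel_ast, template_file='gpu/kernel.h',
                        additional_jinja_context={'target': 'gpu'})
source = _kernel_source(kernel_ast, template_file='gpu/kernel.cu',
                        additional_jinja_context={'target': 'gpu'})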
diff --git a/pystencils_benchmark/benchmark.py b/pystencils_benchmark/cpu/benchmark.py
similarity index 65%
rename from pystencils_benchmark/benchmark.py
rename to pystencils_benchmark/cpu/benchmark.py
index df6bd9b..4258ee5 100644
--- a/pystencils_benchmark/benchmark.py
+++ b/pystencils_benchmark/cpu/benchmark.py
@@ -1,24 +1,24 @@
 from typing import Union, List
 from collections import namedtuple
 from pathlib import Path
-from jinja2 import Environment, PackageLoader, StrictUndefined
 
 import numpy as np
 
-from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.astnodes import KernelFunction, PragmaBlock
 from pystencils.enums import Backend
 from pystencils.typing import get_base_type
 from pystencils.sympyextensions import prod
 from pystencils.integer_functions import modulo_ceil
 
+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
 from pystencils_benchmark.enums import Compiler
 
-_env = Environment(loader=PackageLoader('pystencils_benchmark'), undefined=StrictUndefined, keep_trailing_newline=True,
-                   trim_blocks=True, lstrip_blocks=True)
-
-
 def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        path: Path = None,
                        *,
@@ -26,14 +26,8 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
                        timing: bool = True,
                        likwid: bool = False
                        ) -> None:
-    if path is None:
-        path = Path('.')
-    else:
-        path.mkdir(parents=True, exist_ok=True)
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
+
+    src_path, include_path = setup_directories(path)
 
     if isinstance(kernel_asts, KernelFunction):
         kernel_asts = [kernel_asts]
@@ -56,39 +50,6 @@ def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
     compiler_toolchain(path, compiler, likwid)
 
 
-def compiler_toolchain(path: Path, compiler: Compiler, likwid: bool) -> None:
-    name = compiler.name
-    jinja_context = {
-        'compiler': name,
-        'likwid': likwid,
-    }
-
-    files = ['Makefile', f'{name}.mk']
-    for file_name in files:
-        with open(path / file_name, 'w+') as f:
-            template = _env.get_template(file_name).render(**jinja_context)
-            f.write(template)
-
-
-def copy_static_files(path: Path) -> None:
-    src_path = path / 'src'
-    src_path.mkdir(parents=True, exist_ok=True)
-    include_path = path / 'include'
-    include_path.mkdir(parents=True, exist_ok=True)
-
-    files = ['timing.h', 'timing.c']
-    for file_name in files:
-        template = _env.get_template(file_name).render()
-        if file_name[-1] == 'h':
-            target_path = include_path / file_name
-        elif file_name[-1] == 'c':
-            target_path = src_path / file_name
-        else:
-            target_path = path / file_name
-        with open(target_path, 'w+') as f:
-            f.write(template)
-
-
 def kernel_main(kernels_ast: List[KernelFunction], *,
                 timing: bool = True, likwid: bool = False) -> str:
     """
@@ -159,34 +120,13 @@ def kernel_main(kernels_ast: List[KernelFunction], *,
         'likwid': likwid,
     }
 
-    main = _env.get_template('main.c').render(**jinja_context)
+    main = _env.get_template('cpu/main.c').render(**jinja_context)
     return main
 
 
 def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    function_signature = generate_c(kernel_ast, dialect=dialect, signature_only=True)
-    header_guard = f'_{kernel_ast.function_name.upper()}_H'
-
-    jinja_context = {
-        'header_guard': header_guard,
-        'function_signature': function_signature,
-    }
-
-    header = _env.get_template('kernel.h').render(**jinja_context)
-    return header
+    return _kernel_header(kernel_ast, dialect=dialect, template_file='cpu/kernel.h')
 
 
 def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
-    kernel_name = kernel_ast.function_name
-    function_source = generate_c(kernel_ast, dialect=dialect)
-    headers = {f'"{kernel_name}.h"', '<math.h>', '<stdint.h>'}
-    headers.update(get_headers(kernel_ast))
-
-    jinja_context = {
-        'function_source': function_source,
-        'headers': sorted(headers),
-        'timing': True,
-    }
-
-    source = _env.get_template('kernel.c').render(**jinja_context)
-    return source
+    return _kernel_source(kernel_ast, dialect=dialect, template_file='cpu/kernel.c')
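The generated directory is a self-contained benchmark project: sources under src/, headers under include/, plus a Makefile and a <compiler>.mk. Building and running follows the same pattern as the test suite (directory name and repeat count are illustrative; kernel as in the sketch above):

import subprocess
from pathlib import Path
import pystencils_benchmark as pb

path = Path('benchmark_cpu')
pb.cpu.generate_benchmark([kernel], path, compiler=pb.Compiler.GCC)
subprocess.run(['make', '-C', str(path)], check=True)               # builds benchmark-GCC
subprocess.run([str(path / 'benchmark-GCC'), '10'], check=True)     # 10 repetitions per kernel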
diff --git a/pystencils_benchmark/enums.py b/pystencils_benchmark/enums.py
index ec56c8a..84cf49e 100644
--- a/pystencils_benchmark/enums.py
+++ b/pystencils_benchmark/enums.py
@@ -6,3 +6,4 @@ class Compiler(Enum):
     GCCdebug = auto()
     Clang = auto()
     ICC = auto()
+    NVCC = auto()
diff --git a/pystencils_benchmark/gpu/__init__.py b/pystencils_benchmark/gpu/__init__.py
new file mode 100644
index 0000000..cfd889e
--- /dev/null
+++ b/pystencils_benchmark/gpu/__init__.py
@@ -0,0 +1 @@
+from .benchmark import generate_benchmark
diff --git a/pystencils_benchmark/gpu/benchmark.py b/pystencils_benchmark/gpu/benchmark.py
new file mode 100644
index 0000000..1e9ce37
--- /dev/null
+++ b/pystencils_benchmark/gpu/benchmark.py
@@ -0,0 +1,132 @@
+from typing import Union, List
+from collections import namedtuple
+from pathlib import Path
+
+from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.typing import get_base_type
+from pystencils.sympyextensions import prod
+from pystencils.transformations import get_common_field
+
+from pystencils_benchmark.common import (_env,
+                                         _kernel_source,
+                                         _kernel_header,
+                                         compiler_toolchain,
+                                         copy_static_files,
+                                         setup_directories)
+from pystencils_benchmark.enums import Compiler
+
+
+def _add_launch_bound(code: str, launch_bounds: tuple) -> str:
+    lb_str = f"__launch_bounds__({', '.join(str(lb) for lb in launch_bounds)}) "
+    splitted = code.split("void ")
+    prefix = splitted[0]
+    if code.startswith("void "):
+        # in case there is nothing before the first "void "
+        prefix = ""
+    return prefix + "void " + lb_str + "void ".join(splitted[1:])
+
+
+def generate_benchmark(kernel_asts: Union[KernelFunction, List[KernelFunction]],
+                       path: Path = None,
+                       *,
+                       compiler: Compiler = Compiler.NVCC,
+                       timing: bool = True,
+                       launch_bounds: tuple = None,
+                       ) -> None:
+
+    src_path, include_path = setup_directories(path)
+
+    if isinstance(kernel_asts, KernelFunction):
+        kernel_asts = [kernel_asts]
+
+    for kernel_ast in kernel_asts:
+        kernel_name = kernel_ast.function_name
+
+        header = kernel_header(kernel_ast)
+        if launch_bounds:
+            header = _add_launch_bound(header, launch_bounds)
+        with open(include_path / f'{kernel_name}.h', 'w+') as f:
+            f.write(header)
+
+        source = kernel_source(kernel_ast)
+        if launch_bounds:
+            source = _add_launch_bound(source, launch_bounds)
+        # TODO CUDA specific suffix
+        with open(src_path / f'{kernel_name}.cu', 'w+') as f:
+            f.write(source)
+
+    with open(src_path / 'main.cu', 'w+') as f:
+        f.write(kernel_main(kernel_asts, timing=timing))
+
+    copy_static_files(path, source_file_suffix='.cu')
+    compiler_toolchain(path, compiler, likwid=False)
+
+
+def kernel_main(kernels_ast: List[KernelFunction], *, timing: bool = True):
+    """
+    Return CUDA code of a benchmark program for the given kernels.
+
+    Args:
+        kernels_ast: A list of pystencils AST objects as returned by create_kernel for benchmarking
+        timing: add timing output to the code, prints time per iteration to stdout
+    Returns:
+        CUDA code as string
+    """
+    Kernel = namedtuple('Kernel', ['name', 'constants', 'fields', 'call_parameters',
+                                   'call_argument_list', 'blocks', 'grid'])
+    kernels = []
+    includes = set()
+    for kernel in kernels_ast:
+        name = kernel.function_name
+        accessed_fields = {f.name: f for f in kernel.fields_accessed}
+        constants = []
+        fields = []
+        call_parameters = []
+        block_and_thread_numbers = dict()
+        for p in kernel.get_parameters():
+            if not p.is_field_parameter:
+                constants.append((p.symbol.name, str(p.symbol.dtype)))
+                call_parameters.append(p.symbol.name)
+            else:
+                assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                field = accessed_fields[p.field_name]
+                dtype = str(get_base_type(p.symbol.dtype))
+                elements = prod(field.shape)
+
+                fields.append((p.field_name, dtype, elements))
+                call_parameters.append(p.field_name)
+
+        common_shape = get_common_field(kernel.fields_accessed).shape
+        indexing = kernel.indexing
+        block_and_thread_numbers = indexing.call_parameters(common_shape)
+        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
+        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
+
+        kernels.append(Kernel(name=name, fields=fields, constants=constants, call_parameters=call_parameters,
+                              call_argument_list=",".join(call_parameters),
+                              blocks=block_and_thread_numbers['block'], grid=block_and_thread_numbers['grid']))
+        includes.add(name)
+
+    jinja_context = {
+        'kernels': kernels,
+        'includes': includes,
+        'timing': timing,
+    }
+
+    main = _env.get_template('gpu/main.c').render(**jinja_context)
+    return main
+
+
+def kernel_header(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    return _kernel_header(kernel_ast,
+                          dialect=dialect,
+                          template_file='gpu/kernel.h',
+                          additional_jinja_context={'target': 'gpu'})
+
+
+def kernel_source(kernel_ast: KernelFunction, dialect: Backend = Backend.C) -> str:
+    return _kernel_source(kernel_ast,
+                          dialect=dialect,
+                          template_file='gpu/kernel.cu',
+                          additional_jinja_context={'target': 'gpu'})
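The GPU front end mirrors the CPU one but defaults to NVCC and accepts the optional launch_bounds tuple, which is spliced into both header and source. A condensed usage sketch (same shape as the new GPU test below; sizes are illustrative):

from pathlib import Path
import numpy as np
import pystencils as ps
import pystencils_benchmark as pb

a, b, c = ps.fields(a=np.ones(1024), b=np.ones(1024), c=np.ones(1024))

@ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
def vadd():
    a[0] @= b[0] + c[0]

pb.gpu.generate_benchmark([ps.create_kernel(**vadd)], Path('benchmark_gpu'),
                          launch_bounds=(256, 2))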
diff --git a/pystencils_benchmark/templates/Makefile b/pystencils_benchmark/templates/Makefile
index 66b68b8..d62c513 100644
--- a/pystencils_benchmark/templates/Makefile
+++ b/pystencils_benchmark/templates/Makefile
@@ -29,6 +29,8 @@ LIBS += -llikwid
 VPATH = $(SRC_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OBJ = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+# TODO CUDA specific SUFFIX
+OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
 
 CFLAGS := $(CFLAGS) $(DEFINES) $(INCLUDES)
 
@@ -43,6 +45,12 @@ $(BUILD_DIR)/%.o: %.c
 	$(Q)$(CC) -c $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
 
+# TODO CUDA specific SUFFIX
+$(BUILD_DIR)/%.o: %.cu
+	@echo "===> COMPILE $@"
+	$(Q)$(CC) -c $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s: %.c
 	@echo "===> GENERATE ASM $@"
 	$(Q)$(CC) -S $(CFLAGS) $< -o $@
diff --git a/pystencils_benchmark/templates/NVCC.mk b/pystencils_benchmark/templates/NVCC.mk
new file mode 100644
index 0000000..71010ab
--- /dev/null
+++ b/pystencils_benchmark/templates/NVCC.mk
@@ -0,0 +1,12 @@
+CC = nvcc
+LINKER = $(CC)
+
+# More warnings, please
+#CFLAGS += -Wfloat-equal -Wundef -Wshadow -Wpointer-arith -Wcast-align -Wstrict-overflow=5 -Wwrite-strings -Waggregate-return
+# Maybe too many warnings
+#CFLAGS += -Wcast-qual -Wswitch-default -Wconversion -Wunreachable-code
+# Specific C flags
+CFLAGS := -use_fast_math
+DEFINES = -D_GNU_SOURCE -DNDEBUG
+INCLUDES =
+LIBS =
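compiler_toolchain keys the toolchain file off Compiler.name, so NVCC support only needed the enum entry plus this NVCC.mk template. A sketch of how the build files get written (directory name is illustrative):

from pathlib import Path
from pystencils_benchmark.common import compiler_toolchain
from pystencils_benchmark.enums import Compiler

# renders 'Makefile' and 'NVCC.mk' into the target directory,
# so a plain `make` there builds with CC = nvcc
compiler_toolchain(Path('benchmark_gpu'), Compiler.NVCC, likwid=False)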
diff --git a/pystencils_benchmark/templates/kernel.c b/pystencils_benchmark/templates/cpu/kernel.c
similarity index 100%
rename from pystencils_benchmark/templates/kernel.c
rename to pystencils_benchmark/templates/cpu/kernel.c
diff --git a/pystencils_benchmark/templates/kernel.h b/pystencils_benchmark/templates/cpu/kernel.h
similarity index 100%
rename from pystencils_benchmark/templates/kernel.h
rename to pystencils_benchmark/templates/cpu/kernel.h
diff --git a/pystencils_benchmark/templates/main.c b/pystencils_benchmark/templates/cpu/main.c
similarity index 100%
rename from pystencils_benchmark/templates/main.c
rename to pystencils_benchmark/templates/cpu/main.c
diff --git a/pystencils_benchmark/templates/gpu/kernel.cu b/pystencils_benchmark/templates/gpu/kernel.cu
new file mode 100644
index 0000000..973369f
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.cu
@@ -0,0 +1,8 @@
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_source}}
diff --git a/pystencils_benchmark/templates/gpu/kernel.h b/pystencils_benchmark/templates/gpu/kernel.h
new file mode 100644
index 0000000..102d9c9
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/kernel.h
@@ -0,0 +1,11 @@
+#ifndef {{header_guard}}
+#define {{header_guard}}
+
+
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+
+{{function_signature}};
+
+#endif
diff --git a/pystencils_benchmark/templates/gpu/main.c b/pystencils_benchmark/templates/gpu/main.c
new file mode 100644
index 0000000..1373048
--- /dev/null
+++ b/pystencils_benchmark/templates/gpu/main.c
@@ -0,0 +1,72 @@
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "timing.h"
+
+#define RESTRICT __restrict__
+#define FUNC_PREFIX __global__
+#include <cuda_runtime.h>
+
+//kernels
+{% for include in includes %}
+#include "{{ include }}.h"
+{% endfor %}
+
+int main(int argc, char **argv)
+{
+    if(argc < 2) {
+        printf("Usage: %s <n_repeat>\n", argv[0]);
+        return -1;
+    }
+    int n_repeat = atoi(argv[1]);
+    {% for kernel in kernels %}
+
+    { // Kernel: {{kernel.name}}
+        {% for field_name, dataType, elements in kernel.fields %}
+        {{dataType}} *{{field_name}};
+        cudaMalloc(&{{field_name}}, {{elements}}*sizeof({{dataType}}));
+        // cudaMemset writes bytes, so pass an int value and a byte count
+        cudaMemset({{field_name}}, 0, {{elements}}*sizeof({{dataType}}));
+        {% endfor %}
+
+        {% for constantName, dataType in kernel.constants %}
+        // Constant {{constantName}}
+        {{dataType}} {{constantName}};
+        {{constantName}} = 0.23;
+        {% endfor %}
+
+        dim3 blocks({{kernel.blocks[0]}}, {{kernel.blocks[1]}}, {{kernel.blocks[2]}});
+        dim3 grid({{kernel.grid[0]}}, {{kernel.grid[1]}}, {{kernel.grid[2]}});
+
+        for(int warmup = 1; warmup >= 0; --warmup) {
+            int repeat = 2;
+            if(warmup == 0) {
+                repeat = n_repeat;
+            }
+
+            {% if timing %}
+            double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
+            timing(&wcStartTime, &cpuStartTime);
+            {% endif %}
+
+            for (; repeat > 0; --repeat)
+            {
+                {{kernel.name}}<<<grid, blocks>>>({{kernel.call_argument_list}});
+            }
+
+            {% if timing %}
+            cudaDeviceSynchronize(); // kernel launches are asynchronous; sync before reading the clock
+            timing(&wcEndTime, &cpuEndTime);
+
+            if(warmup == 0)
+                printf("%s\t%e\n", "{{kernel.name}}", (wcEndTime - wcStartTime) / n_repeat);
+            {% endif %}
+        }
+        {% for field_name, dataType, elements in kernel.fields %}
+        cudaFree({{field_name}});
+        {% endfor %}
+    }
+    {% endfor %}
+}
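The dim3 block and grid literals in main.c are fixed at generation time: kernel_main asks the kernel's GPU indexing object to size the launch for the common field shape, exactly as in gpu/benchmark.py above. The relevant step in isolation (kernel_ast assumed to be a GPU kernel from ps.create_kernel; the example values assume a 4,000,000-element 1D field with 256 threads per block):

from pystencils.transformations import get_common_field

common_shape = get_common_field(kernel_ast.fields_accessed).shape
params = kernel_ast.indexing.call_parameters(common_shape)
block = tuple(int(i) for i in params['block'])  # e.g. (256, 1, 1)
grid = tuple(int(i) for i in params['grid'])    # e.g. (15625, 1, 1)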
diff --git a/setup.cfg b/setup.cfg
index 4063797..8bc42e9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,10 @@ license = AGPLv3
 version = 0.0.1
 
 [options]
-packages = pystencils_benchmark
+packages =
+    pystencils_benchmark
+    pystencils_benchmark.gpu
+    pystencils_benchmark.cpu
 install_requires =
     jinja2 >= 3.0
     pystencils >= 0.3.4
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 70af02e..b47f9c5 100755
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -6,7 +6,9 @@ import tempfile
 import pytest
 import pystencils as ps
 from pathlib import Path
-from pystencils_benchmark import generate_benchmark, Compiler
+
+from pystencils_benchmark import Compiler
+import pystencils_benchmark as pb
 
 compilers = (Compiler.GCC, Compiler.GCCdebug, Compiler.Clang)
 
@@ -16,6 +18,20 @@ config_kwargs = ({},
                                         'assume_aligned': True}})
 
 
+def nvidia_gpu_available():
+    try:
+        return subprocess.call(['nvidia-smi']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
+def nvcc_available():
+    try:
+        return subprocess.call(['nvcc', '--version']) == 0
+    except (FileNotFoundError,):
+        return False
+
+
 @pytest.mark.parametrize('compiler', compilers)
 @pytest.mark.parametrize('config_kwarg', config_kwargs)
 def test_generate(compiler, config_kwarg):
@@ -34,8 +50,36 @@ def test_generate(compiler, config_kwarg):
 
     with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
         temp_dir = Path(temp_dir)
-        generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
+        pb.cpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler)
         subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
         subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
+
+
+gpu_kwargs = ({}, {'launch_bounds': (256,)}, {'launch_bounds': (256, 2)})
+
+
+@pytest.mark.parametrize('kwargs', gpu_kwargs)
+def test_generate_gpu(kwargs):
+    compiler = Compiler.NVCC
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+    alpha = sp.symbols('alpha')
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def daxpy():
+        b[0] @= alpha * a[0] + b[0]
+    kernel_daxpy = ps.create_kernel(**daxpy)
+
+    with tempfile.TemporaryDirectory(dir=Path.cwd()) as temp_dir:
+        temp_dir = Path(temp_dir)
+        pb.gpu.generate_benchmark([kernel_vadd, kernel_daxpy], temp_dir, compiler=compiler, **kwargs)
+        if not nvcc_available():
+            pytest.skip("nvcc is not available!")
+        subprocess.run(['make', '-C', f'{temp_dir}'], check=True)
+        if not nvidia_gpu_available():
+            pytest.skip("There is no GPU available!")
+        subprocess.run([f'{temp_dir}/benchmark-{compiler.name}', '10'], check=True)
diff --git a/tests/test_launch_bounds.py b/tests/test_launch_bounds.py
new file mode 100644
index 0000000..48af06d
--- /dev/null
+++ b/tests/test_launch_bounds.py
@@ -0,0 +1,19 @@
+import numpy as np
+import pystencils as ps
+from pystencils_benchmark.gpu.benchmark import kernel_header, _add_launch_bound, kernel_source
+
+
+def test_launch_bounds():
+    a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
+
+    @ps.kernel_config(ps.CreateKernelConfig(target=ps.Target.GPU))
+    def vadd():
+        a[0] @= b[0] + c[0]
+    kernel_vadd = ps.create_kernel(**vadd)
+    launch_bounds = (256, 2)
+    header = kernel_header(kernel_vadd)
+    header = _add_launch_bound(header, launch_bounds)
+    assert "void __launch_bounds__(256, 2)" in header
+    source = kernel_source(kernel_vadd)
+    source = _add_launch_bound(source, launch_bounds)
+    assert "void __launch_bounds__(256, 2)" in source
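_add_launch_bound is plain string surgery on the generated code: it re-inserts the qualifier right after the first "void ", which is why the test asserts on the combined substring. A quick illustration (the signature string is hypothetical):

from pystencils_benchmark.gpu.benchmark import _add_launch_bound

sig = "FUNC_PREFIX void kernel_vadd(double * RESTRICT const _data_a);"
print(_add_launch_bound(sig, (256, 2)))
# FUNC_PREFIX void __launch_bounds__(256, 2) kernel_vadd(double * RESTRICT const _data_a);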
diff --git a/ve_example/test.py b/ve_example/test.py
index ee5e32b..9bb91d6 100755
--- a/ve_example/test.py
+++ b/ve_example/test.py
@@ -4,15 +4,16 @@ import subprocess
 import numpy as np
 import sympy as sp
 import pystencils as ps
-from pystencils_benchmark import generate_benchmark, Compiler
+import pystencils_benchmark as pb
 from pathlib import Path
 
 
-def generate(path: Path, compiler: Compiler):
+def generate(path: Path, compiler: pb.Compiler):
     a, b, c = ps.fields(a=np.ones(4000000), b=np.ones(4000000), c=np.ones(4000000))
     alpha = sp.symbols('alpha')
 
     kernels = []
+
     @ps.kernel_config(ps.CreateKernelConfig())
     def vadd():
         a[0] @= b[0] + c[0]
@@ -33,20 +34,20 @@ def generate(path: Path, compiler: Compiler):
         b[0] @= alpha * a[0] + b[0]
     kernels.append(ps.create_kernel(**daxpy_vector))
 
-    generate_benchmark(kernels, path, compiler=compiler)
+    pb.cpu.generate_benchmark(kernels, path, compiler=compiler)
 
 
 def make(path: Path):
     subprocess.run(['make'], check=True)
 
 
-def execute(path: Path, compiler: Compiler):
+def execute(path: Path, compiler: pb.Compiler):
     subprocess.run([f'./benchmark-{compiler.name}', '100'], check=True)
 
 
 def main():
-    compiler = Compiler.GCCdebug
-    path = Path.cwd()
+    compiler = pb.Compiler.GCCdebug
+    path = Path.cwd() / 'generated'
     generate(path, compiler)
     make(path)
    execute(path, compiler)
@@ -54,4 +55,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-
-- 
GitLab