From 4e688b867339c90a6fba3ad12c383d1e59211ed9 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 9 May 2025 11:50:36 +0200 Subject: [PATCH] Include reductions WIP state Squashed commit of the following: commit d2f04568f355d9fd55b907ee42399747355d63b0 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue May 6 09:43:47 2025 +0200 Add missing include of gpu atomics commit 013e6546902cc2e2b1e508d03c52f394a891df37 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 12:24:10 2025 +0200 Broadcast reduction init val as neutral element commit d9c3be8594a55ea74b63c320ccf516b627cb510c Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 12:23:38 2025 +0200 Minor typecheck fix commit 13e2c1181cf0cd9eb3d5b695ca5304c9d7c7048f Merge: 0a095f83 3e664665 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:34:09 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 0a095f8322df12f51e08551cb5fca509680a027c Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:33:56 2025 +0200 Move simd horizontal op header commit c6bcdefbcfb2ebe25ea28987827c2f7990aa7057 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:31:51 2025 +0200 Move gpu atomics header commit e1c2c4636f82fac2c7a7bbc9e65ff998f8a40e3e Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:30:16 2025 +0200 Add reductions param to LoopVectorizer commit 83fcc73a60d210e9a79d2257a6d7359572172757 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 10:32:32 2025 +0200 Add reductions param to AddOpenMP commit c76b897f87b2a4841ab0894249cdaf771d7941b5 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 16:09:09 2025 +0200 Drop ReductionFunctions and introduce PsReductionWriteBack commit 3e66466514bc1d426056369e21d0ed53580082f5 Author: Frederik Hennig <frederik.hennig@fau.de> Date: Wed Apr 30 15:47:44 2025 +0200 Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik 
Hennig <frederik.hennig@fau.de> commit c430559032934ce9234dfad11fea2d60bd37c6ff Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 15:39:06 2025 +0200 Fix missing resolution of ConstantFunctions on GPU platforms commit 2ca58226cdac171f0373f4cd23b4efb77e9e7505 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 15:28:14 2025 +0200 Minor adaptation of required gpu headers commit 65362ddfc2bed0a28a7c071962366c206d552d97 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 13:55:29 2025 +0200 Fix typecheck note for match args of PsConstantFunction commit c0af39ae08afad52498ffd683c5e64cc7fbbf9a9 Merge: a9c8d6a7 d0f8a04e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 13:30:40 2025 +0200 Merge branch 'v2.0-dev' into rangersbach/reductions commit a9c8d6a7ccf65e79fc57d3e92818becb513ae4d3 Author: Frederik Hennig <frederik.hennig@fau.de> Date: Sat Apr 26 21:01:57 2025 +0200 Apply 4 suggestion(s) to 2 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> commit 0e2a548a372bc95cbface7e0a7b22c24aa04d289 Merge: a14f13fb 9c5b9bce Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 17:42:42 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit a14f13fb6bd17fb9dbe8e4db00a3ebd3fd2391a1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 17:39:22 2025 +0200 Omit extra type context creation for scalar op in PsVecHorizontal commit 9c5b9bce9065759461bb0c6ed2e88b4a665789d6 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:35:17 2025 +0200 Omit unnecessary type context creations for PsVecHorizontal commit 508701f6d3f352d0adfb655b8ea9aae3a15a5745 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:32:16 2025 +0200 Fix lint commit 3757e179c0e3b73f715819dbef1c316ba1b13af8 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:27:07 2025 +0200 Omit unnecessary replace_symbol call commit f3105780593ce575581cc6231c7b02f87fe40319 
Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:15:37 2025 +0200 Add unit test for ReductionAssignment commit 4106f9fdf0a4f1c0c6806ac3bca2a6ae467d31d1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:57:33 2025 +0200 Parameterize test_reduction_assignments with reduction ops commit 490ec9144b01c10c6a6e8d1a927fd42e37a56511 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:51:08 2025 +0200 Omit old stuff from reduction_op_to_expr commit 394836094f59bbb248a0c385510ff774a2d70457 Merge: 1580a2b0 c0df001f Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:45:27 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 1580a2b06b0aa060def7428cf5ac3f7938b81f92 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:45:11 2025 +0200 Adapt test_invalid_reduction_assignments to create new contexts for each subtest commit 325ca38652751eccdd33fe2cc9dc55735c6dfc0c Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:44:08 2025 +0200 Move checks and init value determination for ReductionAssignments to add_reduction_info commit 0c8654e3251dea9c4d0ca9e3fa14f54dc970564b Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:36:58 2025 +0200 Extend KernelAnalysis to check for invalid references to reduction symbols commit 705ac53161f9a2fe49a2898de36f118a79e2d6a2 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:19:00 2025 +0200 Minor doc change commit c0df001f196655dd5b3ada6f1fa3909b90584abd Author: Frederik Hennig <frederik.hennig@fau.de> Date: Fri Apr 25 12:59:07 2025 +0200 Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> commit da00b78ff50a2b5e61ae0d9ad53f56d3a6bc4c4d Author: Frederik Hennig <frederik.hennig@fau.de> Date: Fri Apr 25 12:58:57 2025 +0200 Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> commit 
3010daed816e9fe4ed9b685b1f666c75736d8222 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 21:08:49 2025 +0200 Add typification test for PsVecHorizontal commit 94dcf3c5362238f1203b88f8a6a5873431749dc5 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:47:07 2025 +0200 Add unit test for freezing of ReductionAssignments commit f678a2fa2c38e2fa202a2c2a12472cdcc229b3d2 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:36:36 2025 +0200 Add unit test for checking border cases of freezing illegal usages of ReductionAssignments commit eb6e8c0f8c98d00cb56e6dfc7b7add5036e766ae Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:35:16 2025 +0200 Check if reduction symbol is (illegally) accessed before/after reduction assignment commit f0aba2e948fef8f535644e80b0f7d35e0e5f60e0 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:06:25 2025 +0200 Document attributes of ReductionInfo commit 081d11ad152bfa6e1f809af7d20789f00d39e5a9 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 19:10:21 2025 +0200 Fix indent for error handling in freeze commit dae36dd6edb5140d3bb616d5ade0d61b768d03fb Merge: 603e6a3f 6046db6e Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 18:53:39 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 6046db6e13ddd875e7c68edd9f99fb7a7101f47f Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 17:54:31 2025 +0200 Move symbol handling for reductions to context and add more checks commit 603e6a3fce53a0bbd4d2b21faa12c34da3936b1b Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 16:46:27 2025 +0200 Fix docs commit b0904d93d48f0a165e4b1561e3b267db09986221 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 16:45:26 2025 +0200 Fix typecheck commit 3c3283cd4a823b7aeac883a794c2e2fd05e984e2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 15:44:30 2025 +0200 Merge reduction user guides 
into one document commit b4cabdd81a3e94f2881cb26798d6588badb1c377 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 14:57:25 2025 +0200 Add consistency check for PsVecHorizontal in typifier commit 935beb559495c5094a42cf0e807ca8ba203beb6d Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 14:41:04 2025 +0200 Move kernel AST modifications for reductions to distinct function commit 8394f0f4c115cb37c5e53f3e08f62a32bd128a3e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 14:18:29 2025 +0200 Move reduction_op_mapping.py commit 7b74cafaedc589a52f91e7a28d60528956edd19f Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 13:01:20 2025 +0200 Fix import commit 9e61ccebe4e6a37205441f94573f714e987d17cd Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 12:58:38 2025 +0200 Omit lanes for match args in PsVecHorizontal commit 36124cd24e89342ae96dd0bee8a05bdfc2542092 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:58:34 2025 +0200 Add check for typed symbols for ReductionAssignment constructor commit 427e53442e166cf34a23e7588a1f8cd72ce52438 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:35:47 2025 +0200 Remove more parts of reduction_assignment_from_str commit f7142b16ae26f30f96f07e0063c65389fbc34b3f Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:32:47 2025 +0200 Remove lanes arg from PsVecHorizontal commit 99726a97a0a834dfb53960ad1fb72be4f72849e2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:30:29 2025 +0200 Add docstring to PsVecHorizontal commit 21df6f4b5915134b537c20398b8bdeec00b6b28e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 10:57:43 2025 +0200 Omit admonitions from docs commit a2060520faddc99c58c8f22eedfe267d09d525b1 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 10:54:44 2025 +0200 Get rid of reduction_assignment_from_str commit bd99cd11f489ac7ae7a5dd602df69b4b087d0f08 Merge: 5bef84a8 9d634352 Author: zy69guqi 
<richard.angersbach@fau.de> Date: Wed Apr 2 19:47:55 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 5bef84a8e28279c46cd73b67352201622d9aa89e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 2 19:47:49 2025 +0200 Write howto guide for reductions commit f469e70e18c278385ab4237b3f30afb70da4c606 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 2 19:19:28 2025 +0200 Extend test_reduction_gpu with assume_warp_aligned_block_size and use_block_fitting parameters commit 9d6343522ea1d79e6b79c170d69fbf8d4108a78a Merge: 492fb30a b008a9e9 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 14:14:38 2025 +0200 Merge branch 'rangersbach/reductions' of i10git.cs.fau.de:pycodegen/pystencils into rangersbach/reductions commit 492fb30a9da935fe0273201de79e90b1eb1ae338 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 13:53:34 2025 +0200 Adapt error messages for reduced assignments in freeze.py commit 3484e7f794e4e720d7bc5931b90a4b7caf2ffc59 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 13:49:47 2025 +0200 Rename compound op mapping funcs commit 7b43ffd2e12f5d69ccc30fdf451ee47d7caea6ae Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 13:45:15 2025 +0200 Add minor comment to ReductionInfo dataclass commit b008a9e9954b83fd371c572c321b26597211a9c1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Mar 28 13:26:09 2025 +0100 Adapt guards for generated avx512 horizontal ops commit 806dcb6b2aaa42849d8af7f12b62b730eec7fa0e Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 27 17:02:57 2025 +0100 Move resolution of reductions to concrete gpu platform classes commit 4e5c89b9cd23610f27d61612b14a12e968724e3f Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 24 16:54:50 2025 +0100 Fix lint, typecheck commit f77909540a9d26ce9dc1decde5c9508bc1f2d14a Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 24 16:22:40 2025 +0100 Use NPP library for 
numeric limits for CUDA, use std limits for HIP commit c31d407433bb075a42691866e87e59208bf99d90 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 18:56:41 2025 +0100 Try fixing required headers for cuda and hip for reductions commit 6a6e57f08bec708036387c78919f7d4e028d86d4 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 18:19:03 2025 +0100 Fix wrong rank being used for obtaining default block sizes commit 2c507d796f0c5abbff32386f4268d3bdb988c6fa Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 18:18:10 2025 +0100 Fix required headers for cuda/hip platforms commit 16a6e80d32e7ff12e151fd75ec6f2905903620df Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 17:11:13 2025 +0100 Add missing type fold for loop vectorizer again commit 5ee715d0df2651e745ff5de0524abfe24d48c968 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 17:04:47 2025 +0100 Reformat adapted files [skip ci] commit d7e6890cff8f327ce285a3360821a4341b2acc46 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 16:23:54 2025 +0100 Fix typecheck commit 9ec813bdd1c8dfd2a1f32bb300d6f9e7d172542f Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 16:07:01 2025 +0100 Employ optimized warp-level reduction based on check commit 90837d04eaad5b3ad049c11fa5af48cf0942e812 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 15:36:01 2025 +0100 Merge handling for GPU reductions into generic_gpu.py for the time being commit 0d7526c07b744f3cd1c7a692b794bd6e7f8bfc37 Merge: 974cf848 f8e5419f Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 15:06:46 2025 +0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit 974cf848bfe474a5003e95c907e3e1289b6a5454 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Mar 12 16:40:25 2025 +0100 Fix header incl commit c7f1518efc14326f9b5732e7e8f6ac7f07acc71a Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Mar 12 16:28:42 2025 +0100 Move manual atomic op implementations to 
new header commit 3202d8b8f4c6902660330806e18a93321ac8f928 Merge: 4a031fc1 cd25546f Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Mar 12 16:23:59 2025 +0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit 4a031fc192fc3f7d30fde450e8ff1788d3ecc3dd Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Feb 27 11:49:00 2025 +0100 Fix lint commit a972d759e7e6e8e8e0c6241a70456184a1366143 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 26 19:15:45 2025 +0100 Fix getter for thread exec condition for dense/sparse iteration spaces in cuda.py commit 02be4d5e994e458e0e42d72045ec53355e7820d1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 19:35:48 2025 +0100 Fix typecheck commit 89a6f36ad50849267b3a95ef5a035d22566c5748 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 18:17:44 2025 +0100 Fix lint commit a8479afadc2a95e2ecda861ce7fd186375477347 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 18:11:51 2025 +0100 Refactor reductionfunction mechanism commit bcd83842f3d331fa039a96e3eab68c34b3beb6f3 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 14:56:46 2025 +0100 Use full mask for CUDA reductions commit 2b6589b8fd5eea2e416432c9261884e9f50e4e1c Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 19:06:30 2025 +0100 Introduce masks for warp reductions and fix errors when shuffling warp results commit c7b564bed65615c77f1065dd4821b5a79d2244b3 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 19:04:57 2025 +0100 Fix CUDARuntimeError import commit ad292c2b42e01c38843eb1a5556ea2ced762f845 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 16:34:14 2025 +0100 Add initial version of warp-level reduction for CUDA commit ce816539159408b26b09b4bf6e1df0cbff437829 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 16:33:25 2025 +0100 Encapsulate fetching of kernel conditions for iteration spaces in separate function commit d10c65d43919f5ee611fc4a0ef7c79f3c3e65822 
Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 18 17:29:22 2025 +0100 Catch CUDARuntimeError for missing CUDA capable device in reduction GPU test commit 99125ca7de57e2d1880b355f88c7136d923fa103 Merge: 77a22268 c2c31e84 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 18 17:09:02 2025 +0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit 77a2226818946a666f98b23e95a76c16b77c1a82 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 18 12:57:49 2025 +0100 Avoid duplicate definition of atomicMin/Max for HIP commit 0c40ed634587f5854d11f7851c7ba9eb5911f4ff Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 18:31:41 2025 +0100 Use import or skip mechanism for cupy commit ef185b4e48f0c0ca40aaba84c928289995537f45 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 18:18:42 2025 +0100 Fix ImportError for cupy commit d14898373c0503bd9bbde0d3c0ee35888519f11f Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 17:54:40 2025 +0100 Fix typecheck commit 13569a616ef3eb6a28a698e8163f0748d4a4c0c0 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 17:31:23 2025 +0100 Fix lint commit 71881893e551cfa3c5ce2217b8fad0ef751e3613 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 17:16:09 2025 +0100 Split reduction test into separate CPU/GPU tests commit fe3cd6cd19f2c11166e2b8f670a0a8a337b898ef Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 16:02:05 2025 +0100 Add generator for SIMD horizontal operations and the emitted code. 
commit 58cdb79206931dfe15a4fdb7f656d8d047a2a6b8 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 15:30:42 2025 +0100 Fix bug with doubly inverted sign for subtraction reductions commit 7306f4ddfbb9066494480f14cdb211ae657f79ba Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 12 15:41:34 2025 +0100 Minor fix commit 8e0a74784ccb0d22c89364871ee630ba67239b2e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 12 15:22:45 2025 +0100 Refactor PsVecHorizontal as PsBinOp commit eb7823a5b28589a45f968ac6415ee2a02543a3ab Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 12 14:13:05 2025 +0100 Minor refactor of reduction ops commit b4b105be0ff674bd1cacaea75e5f92bf8a7fda3c Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 11 17:21:54 2025 +0100 Minor fix commit a959082925c8c55d6116cc91e263fc862ff50fb7 Merge: d4b7e78f 60a348f1 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 11 17:17:45 2025 +0100 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit d4b7e78fca17ca130c55959e330a161d07ebba80 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 11 17:16:23 2025 +0100 Add initial implementation for horizontal reductions for vectorization commit 60a348f1f3f39d184da872007756c3aeed13ccee Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 10 18:30:29 2025 +0100 Support atomic sub, min, max for fp reductions using custom implementations with CAS mechanism commit a71e0d318a6f96bef38ce4b7b260d8f3fd73d91d Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 10 18:03:52 2025 +0100 Temporarily change default CUDA block size for CUDA jit commit 5caafdd05712621bc990ce7ef13c08f232f260af Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 10 15:13:51 2025 +0100 Add guard for INFINITY numeric limit macro used by cuda backend commit 4edb0f9794ac8ad5c8329f2f3f89026aeb18a6b1 Merge: a2a59d40 4876f0b7 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Feb 6 15:57:14 2025 
+0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit a2a59d40b66390cebe849870d1b9cf058da82850 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 4 18:04:15 2025 +0100 Wrap statement around generated atomic call [skip ci] commit d8ae900242264392479f4405678d9a1f1b177890 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 4 17:49:37 2025 +0100 Use predefined macro values for numeric limits in cuda backend commit f60d9d5df3c87c54d587ad7496025e70ee2388f0 Author: Richard Angersbach <iwia025h@csnhr.nhr.fau.de> Date: Tue Feb 4 16:31:16 2025 +0100 Minor adaptations for reduction test commit 826ee8e26f2bb09b95bb473b347d06cd6a36207c Author: Richard Angersbach <iwia025h@csnhr.nhr.fau.de> Date: Tue Feb 4 16:30:13 2025 +0100 Try supporting pointer dtypes for reductions in cupy gpu jit commit 4c7fd40921a2486f8164efc84e455b59138ae6d1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 31 14:56:08 2025 +0100 Fix lint [skip ci] commit 616f609f24551439403c13b0d282fab147099f9f Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 31 14:51:45 2025 +0100 Fix lint [skip ci] commit 0fb11858f2c65fc46c8dca469c75c28bf283dfdb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 20:06:00 2025 +0100 Add CUDA backend for numeric limits commit 10def05e256d2c316f5063e48bbbfebe2c84ea85 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 19:59:46 2025 +0100 Fix typecheck commit e15d3cf7a043fc45b78747d85e035dcd314bcd42 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 19:15:13 2025 +0100 Add first CUDA reduction impl using atomic operations commit 6e08683b7e5fe681986a677a014cbcadafa017f2 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 18:04:02 2025 +0100 Add dummy implementations for unfold_function in cuda/sycl platforms commit dd8f421d774f59d1dd36d4fdcb76a51375f4bda9 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 16:38:46 2025 +0100 Introduce functions to be unfolded by platform into code blocks for 
reduction init and write-back commit 53807242b8f2eae36b75abb3d101d53b3948157a Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 16:20:25 2025 +0100 Remove orig_symbol from reduction info as it is not needed commit a9da7d432d3bb5ed03da5f08f7b5dbd94c17ff7d Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 29 11:56:00 2025 +0100 Fix lint [skip ci] commit 06dc234497d8860c4c7fde704ba26c8c3da030a1 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 29 11:48:26 2025 +0100 Use std::numeric_limits as NumericLimitsFunctions backend for cpu commit 2424c15725cd621fd8bfa573f928b82255a17693 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 20:05:51 2025 +0100 Move reduction properties for local and ptr variable into single dataclass commit e94c4980ed860b127a27743fd9727192d667c906 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 19:18:00 2025 +0100 Adapt reduction assignment interface and employ enums instead of strings for the binary operation employed commit f71ce708a1aaa876a700e880d8cd1b63a0d080ee Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 18:00:53 2025 +0100 Enforce usage of typed symbols for reductions commit c73deaf6d54da2b95310dc1e606ed132b465c874 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 13:11:14 2025 +0100 Fix mypy errors and move binop mapping function commit 3daaa5e5a1f92a5482cb838cd791ed062dae1398 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 12:47:15 2025 +0100 Fix typecheck commit 96b5cbf286a29882b496faee3b9fe3be481d8bb3 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 12:44:21 2025 +0100 Fix lint commit f0d2fde6848f9cddcf0c3b38ca169f2e85abc093 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 24 16:20:37 2025 +0100 Encapsulate mapping of binop strings to actual operands and now also use for considering initial value of passed reduction pointer value commit 4c726aa6aa2df6312252cb848ace95e790d62331 Author: zy69guqi 
<richard.angersbach@fau.de> Date: Fri Jan 24 14:43:32 2025 +0100 Prepare reduction test for GPU support commit b352a2e2e8c2d7f4eeb0861dacff5d703ae51869 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 23 18:18:24 2025 +0100 Fix lint for jit/cpu_extension_module.py commit 5ca40eae11e6d018b165286183b8a8832348b808 Merge: 3e595df6 72fa8672 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 23 18:11:54 2025 +0100 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 3e595df6c79cc1a7a8c2ff4ab86825e81aadbf43 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:35:50 2025 +0100 Refine test_reduction.py to check for result correctness commit ba697180cac45133f756364ef8798d8437852026 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:23:02 2025 +0100 Rewire existing code extraction of fields to support reduction pointer extraction commit f1c556e6f93d5fa042e12e8a0a9c57f3bdea47b7 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:30:43 2025 +0100 Integrate reduction pointers to parameters.py commit 777ab888d5032d7630827f91fd26c388a9a09db2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:51 2025 +0100 Use literals for C macros used for the numeric limits commit 3e0daa67359c7ddc17264b7fd21aa0a0429552e5 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:00 2025 +0100 Propagate properties of reduction pointer symbols to kernel parameters commit c6eedfcda96e84e8279ee624ba3c113f2339bfbe Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 15:16:32 2025 +0100 Split reduction var property into local and pointer-based reduction var properties commit c51ae2b438d688f871ba45b69476ef0c3b475462 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 18:42:43 2025 +0100 Set type of reduced variable to pointer and write back via PsMemAcc commit 3c276118bec2981a9bfc5a0d9654fec358094269 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 
17:34:06 2025 +0100 Fix declaration of local reduction var and write back to original variable commit 0a9abc2a24ec30a444613178a94382da5355a6ef Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 13:55:35 2025 +0100 Swap out neutral init values for reduced assignments with min/max op commit 45ab4e86617492462188a5fc46d3160450a54bf0 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Jan 20 17:46:49 2025 +0100 Try initializing kernel-local reduction variable copy commit 6a7a251f77d0274c32729bc5bfacfe0308d3fec9 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:06:57 2025 +0100 Try fix mypy no-redef error commit 2c15b9890291deea0cb929ae3a5221f3a0671a45 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:02:19 2025 +0100 Define type of init_val for reduction as Any commit dcdfff042120db9960f1859a60fdeb1890f02878 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:56:12 2025 +0100 Parameterize test_reduction.py for different reduction operations commit ffcd54e053a2ddda2e015d2836184f2fcaef59f3 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:55:42 2025 +0100 Fix min/max reductions commit b4dd0c8c55d26f87b4467f814c526fafc2ced76b Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:29:10 2025 +0100 Fix usage of numerical limits for init value of reduction commit 6f8fbdfe7a6cbf14c6f64a86315fa435ed0c9336 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 13:43:32 2025 +0100 Fix removal of function parameters for lhs symbols that are not declared in the kernel commit 55c9812023c384c7978f2536020d8587b7a12019 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:38:34 2025 +0100 Adapt comment of ReductionSymbolProperty commit 355d638aab1179db95204d490fe585f7f4fcb7c1 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:15:58 2025 +0100 Rename min/max of numeric limits enum commit 97c171f3cd5371152fd102d92e302fc13ae4ee2b Author: zy69guqi <richard.angersbach@fau.de> 
Date: Thu Jan 16 17:38:46 2025 +0100 Adaptations to reduction test commit dc5898a77e7b72a2221af8b66fa56640c2516e76 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 17:30:25 2025 +0100 Omit distinction between normal and reduced assignments in AssignmentCollection commit 8fb5af398d208cf9779f7b0c528237d0d79ef7fb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:20:14 2025 +0100 Fix header include of limits.h commit f30ca33b9ea897e66c996759a40b1aebf43a3688 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:19:23 2025 +0100 Fix inheritance of special math function enum classes commit 71aaf722f0a3aecfcc79e2633311aa8f5677b42d Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:18:37 2025 +0100 Add back reduced_assign to sympyextensions interface commit f00708edcaec67412905fbd26842735152da9812 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:15:24 2025 +0100 Reformat reduction.py commit 99a3335135baf5da92ec56713f96beae37798b60 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 15:52:05 2025 +0100 Add omp reduction clauses for reduced symbols commit c96a94619ff058ba81d798b6f4bfbe06cd273535 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 13:33:12 2025 +0100 Add C function selection for numeric limits functions commit e5861425c5e2e4868e0c696763297768814c02ec Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 12:59:16 2025 +0100 Minor adaptation on how symbols are given reduction property commit 3d592ab07194bc623884b81413278d954434e89b Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:51:22 2025 +0100 Try fixing circular module import commit d8a717874a0156d0c27c75b04c859b3a4a98268d Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:50:51 2025 +0100 Add dictionary of reduced symbols to codegen context commit 35c8160bf5f9da900255583bbfb10f935d5b3687 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:19:09 2025 +0100 Minor import 
fixes commit 548375295a08f88bef9ba8e597c47fe7143d8139 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:42:00 2025 +0100 Introduce reduction symbol property and add to lhs of reduced symbol commit b263d752d8af83569517aa56ef17facfa26adc91 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:39:39 2025 +0100 Add functions for numeric limits (to be supported by the backends) commit fae371d48773f9423855be824f495f3f20abcdc2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:36:18 2025 +0100 Get rid of reduction using the division operator commit c7b9bb522828f8d5e32d3c20ad7d17eee689d2eb Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 12:59:36 2025 +0100 Expose new reduced assignments to pystencils interface commit f54fa321869682e71edd3f2828725104de9abe36 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 11:36:53 2025 +0100 Fix relative module imports for newly introduced sympyextensions for reductions commit 6b8bff09ef5b1c3f451aacae6aa730a9a822f5b2 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 14 18:13:18 2025 +0100 Initial work for introducing reduction capabilities to pystencils Signed-off-by: zy69guqi <richard.angersbach@fau.de> commit 72fa86729d84028e8b900e8a9cd0f9a8cdfab401 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:35:50 2025 +0100 Refine test_reduction.py to check for result correctness commit 4f6f5580bc5cb302e1a064a9d642e66822155db6 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:23:02 2025 +0100 Rewire existing code extraction of fields to support reduction pointer extraction commit 4e748308aca64b02cfa689fb0356a9a38d8c35af Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:30:43 2025 +0100 Integrate reduction pointers to parameters.py commit 6c8ee44f148f13b456c0c9c754e434cc9cf5b59a Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:51 2025 +0100 Use literals for C macros used for the numeric limits commit 
350a4eac9394ca7bb95231b5ad14a1526952e23c Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:00 2025 +0100 Propagate properties of reduction pointer symbols to kernel parameters commit b5dd2ef085b19e505d0331d8aa8f6dae9ee85eb0 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 15:16:32 2025 +0100 Split reduction var property into local and pointer-based reduction var properties commit 6bc3cf3f17ed3395dabc15a2207739de245cd038 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 18:42:43 2025 +0100 Set type of reduced variable to pointer and write back via PsMemAcc commit 9fd1c2ad9bb4f8c225627306f0369273092ef737 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 17:34:06 2025 +0100 Fix declaration of local reduction var and write back to original variable commit e0347f54f8b0b713525cfa473a7df63cb98ca8cb Merge: 90ca9ead 3fc9a049 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 17:11:10 2025 +0100 Merge branch 'rangersbach/reductions' of i10git.cs.fau.de:pycodegen/pystencils into rangersbach/reductions commit 3fc9a049683b0cbac6bfa721efd38db05c201236 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 13:55:35 2025 +0100 Swap out neutral init values for reduced assignments with min/max op commit 90ca9ead0199cd4f5988e6c43e9c9c5350f566b6 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Jan 20 17:46:49 2025 +0100 Try initializing kernel-local reduction variable copy commit 75ea862f50d5372126d1a934b4b1e15ba3dc8c85 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:06:57 2025 +0100 Try fix mypy no-redef error commit 3c5a93b4a0016a2c12e0f210fd1324816e6df15e Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:02:19 2025 +0100 Define type of init_val for reduction as Any commit 9bbb8181dcd0717dd61de853f33e11c1ca19d806 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:56:12 2025 +0100 Parameterize test_reduction.py for different reduction operations commit 
a3025645e30265621d41315e7a0449768225c361 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:55:42 2025 +0100 Fix min/max reductions commit bb984679607d2f3625849667f06c348245b1813a Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:29:10 2025 +0100 Fix usage of numerical limits for init value of reduction commit fff5a079171ad884069cd307f2777a2dc0aa68f6 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 13:43:32 2025 +0100 Fix removal of function parameters for lhs symbols that are not declared in the kernel commit 1a1c23b57015ace0c821d0a709d2bf17fa67b42a Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:38:34 2025 +0100 Adapt comment of ReductionSymbolProperty commit f16d8e7978174c24f682202b65aa64e1d53003bb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:15:58 2025 +0100 Rename min/max of numeric limits enum commit e9ee769d2b8d2f177611d6ca5c2ccc0394d83874 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 17:38:46 2025 +0100 Adaptations to reduction test commit 9a8e6f9bb9a14a144f80b678147c1a4c36456741 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 17:30:25 2025 +0100 Omit distinction between normal and reduced assignments in AssignmentCollection commit 9741c024245137405c2c7b09db63267d88d6c12b Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:20:14 2025 +0100 Fix header include of limits.h commit cf2ec0662b8ea423c252c71e4e926e7fc388d4da Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:19:23 2025 +0100 Fix inheritance of special math function enum classes commit ef9239ede7d00457d90aab5a1740894dfc47d21a Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:18:37 2025 +0100 Add back reduced_assign to sympyextensions interface commit 555a6a836408071ca32c705bdf0fdf5e5e610437 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:15:24 2025 +0100 Reformat reduction.py commit a16969bf5c054b93c5e8a1a69a291d8437bbaa35 Author: 
zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 15:52:05 2025 +0100 Add omp reduction clauses for reduced symbols commit 4ae330dc8e87ac8cbb83c6f402ab99cdcb9e9edb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 13:33:12 2025 +0100 Add C function selection for numeric limits functions commit af855492661c5649d3286eb2153f369b3813fb88 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 12:59:16 2025 +0100 Minor adaptation on how symbols are given reduction property commit b8718cb1d67b14b39b9d806ff3131179fa97e24e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:51:22 2025 +0100 Try fixing circular module import commit 53fc7ca4c0ad2e601a09050b3273781ded65542a Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:50:51 2025 +0100 Add dictionary of reduced symbols to codegen context commit 66ce43954c585e5d576f895ab0d60f62196db813 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:19:09 2025 +0100 Minor import fixes commit 719a76fba40197320d03bab06e1d139e6d24a724 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:42:00 2025 +0100 Introduce reduction symbol property and add to lhs of reduced symbol commit 778cfd51b4df56c7fccf3686b0c0d3273b43a202 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:39:39 2025 +0100 Add functions for numeric limits (to be supported by the backends) commit ba1458538a9c954803d26337f7b428f599421f2c Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:36:18 2025 +0100 Get rid of reduction using the division operator commit 558a0f20e082370a0bccd20b96a647e3536bc31e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 12:59:36 2025 +0100 Expose new reduced assignments to pystencils interface commit 543bf118944b32b851b526964ee275d7a1808034 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 11:36:53 2025 +0100 Fix relative module imports for newly introduced sympyextensions for reductions commit 
8a2bf6648548c18ab47d71db429956e2bf1f52b6 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 14 18:13:18 2025 +0100 Initial work for introducing reduction capabilities to pystencils Signed-off-by: zy69guqi <richard.angersbach@fau.de> --- docs/source/backend/gpu_codegen.md | 1 + docs/source/index.rst | 1 + docs/source/user_manual/reductions.md | 202 ++++++++++++ src/pystencils/__init__.py | 21 +- src/pystencils/backend/ast/analysis.py | 2 +- src/pystencils/backend/ast/vector.py | 82 ++++- src/pystencils/backend/emission/ir_printer.py | 15 +- src/pystencils/backend/functions.py | 35 +- .../backend/kernelcreation/analysis.py | 17 + .../backend/kernelcreation/context.py | 88 ++++- .../backend/kernelcreation/freeze.py | 58 +++- .../backend/kernelcreation/typification.py | 36 +- src/pystencils/backend/platforms/cuda.py | 110 ++++++- .../backend/platforms/generic_cpu.py | 47 ++- .../backend/platforms/generic_gpu.py | 139 ++++++-- src/pystencils/backend/platforms/hip.py | 16 +- src/pystencils/backend/platforms/platform.py | 7 +- src/pystencils/backend/platforms/sycl.py | 7 +- src/pystencils/backend/platforms/x86.py | 38 ++- .../backend/reduction_op_mapping.py | 24 ++ .../backend/transformations/add_pragmas.py | 16 + .../transformations/loop_vectorizer.py | 68 +++- .../transformations/select_functions.py | 44 ++- .../transformations/select_intrinsics.py | 16 +- src/pystencils/codegen/driver.py | 60 +++- src/pystencils/codegen/gpu_indexing.py | 4 +- .../pystencils_runtime/bits/gpu_atomics.h | 90 +++++ .../include/pystencils_runtime/cuda.cuh | 1 + .../include/pystencils_runtime/hip.h | 1 + .../simd_horizontal_helpers.h | 231 +++++++++++++ src/pystencils/jit/cpu_extension_module.py | 63 +++- src/pystencils/jit/gpu_cupy.py | 10 +- src/pystencils/simp/assignment_collection.py | 1 + src/pystencils/sympyextensions/__init__.py | 5 +- src/pystencils/sympyextensions/reduction.py | 82 +++++ tests/frontend/test_sympyextensions.py | 38 ++- tests/kernelcreation/test_reduction.py | 
93 ++++++ .../nbackend/kernelcreation/test_analysis.py | 38 +++ tests/nbackend/kernelcreation/test_freeze.py | 83 +++++ .../kernelcreation/test_typification.py | 47 ++- util/generate_simd_horizontal_op.py | 309 ++++++++++++++++++ 41 files changed, 2112 insertions(+), 134 deletions(-) create mode 100644 docs/source/user_manual/reductions.md create mode 100644 src/pystencils/backend/reduction_op_mapping.py create mode 100644 src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h create mode 100644 src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h create mode 100644 src/pystencils/sympyextensions/reduction.py create mode 100644 tests/kernelcreation/test_reduction.py create mode 100644 tests/nbackend/kernelcreation/test_analysis.py create mode 100644 util/generate_simd_horizontal_op.py diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 3fe00840e..a95c36566 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -1,3 +1,4 @@ +(gpu_codegen)= # GPU Code Generation The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components: diff --git a/docs/source/index.rst b/docs/source/index.rst index 6dba50af1..4e1070979 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,6 +81,7 @@ Topics user_manual/symbolic_language user_manual/kernelcreation + user_manual/reductions user_manual/gpu_kernels user_manual/WorkingWithTypes diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md new file mode 100644 index 000000000..5b45a921c --- /dev/null +++ b/docs/source/user_manual/reductions.md @@ -0,0 +1,202 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.4 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +mystnb: + execution_mode: cache +--- + +```{code-cell} 
ipython3 +:tags: [remove-cell, raises-exception] + +import sympy as sp +import pystencils as ps +import numpy as np +import cupy as cp + +from enum import Enum +``` + +(guide_reductions)= +# Reductions in Pystencils + +Reductions play a vital role in numerical simulations as they allow aggregating data across multiple elements, +such as computing sums, products over an array or finding its minima or maxima. + +## Specifying Assignments with Reductions + +In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`. +Here is a snippet creating a reduction assignment for adding up all elements of a field: + +```{code-cell} ipython3 +r = ps.TypedSymbol("r", "double") +x = ps.fields(f"x: double[3D]", layout="fzyx") + +assign_sum = ps.AddReductionAssignment(r, x.center()) +``` + +For each point in the iteration space, the left-hand side symbol `r` accumulates the contents of the +right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via additions. + +**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.** + +The following reduction assignment classes are available in pystencils: +* `AddReductionAssignment`: Builds sum over elements +* `SubReductionAssignment`: Builds difference over elements +* `MulReductionAssignment`: Builds product over elements +* `MinReductionAssignment`: Finds minimum element +* `MaxReductionAssignment`: Finds maximum element + +:::{note} +Alternatively, you can also make use of the `reduction_assignment` function +to specify reduction assignments: +::: + +```{code-cell} ipython3 +from pystencils.sympyextensions import reduction_assignment +from pystencils.sympyextensions.reduction import ReductionOp + +assign_sum = reduction_assignment(r, ReductionOp.Add, x.center()) +``` + +For other reduction operations, the following enums can be passed to `reduction_assignment`. 
+ +```{code-cell} python3 +class ReductionOp(Enum): + Add = "+" + Sub = "-" + Mul = "*" + Min = "min" + Max = "max" +``` + +## Generating and Running Reduction Kernels + +With the assignments being fully assembled, we can finally invoke the code generator and +create the kernel object via the {any}`create_kernel` function. + +### CPU Platforms + +For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled. + +```{code-cell} ipython3 +cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) +kernel = ps.create_kernel(assign_sum, cfg) + +ps.inspect(kernel) +``` + +:::{note} +The generated reduction kernels may vary vastly for different platforms and optimizations. +You can find a detailed description of configuration choices and their impact on the generated code below. +::: + +The kernel can be compiled and run immediately. + +To execute the kernel on CPUs, not only a {any}`numpy.ndarray` has to be passed for each field +but also one for exporting reduction results. +The export mechanism can be seen in the previously generated code snippet. +Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`). +This pointer is used for exporting the reduction result back from the kernel. +Please note that the **values passed via pointer will not be overwritten** +but will be incorporated in the reduction computation. +Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a singular value. + +```{code-cell} ipython3 + kernel_func = kernel.compile() + + x_array = np.ones((4, 4, 4), dtype="float64") + reduction_result = np.zeros((1,), dtype="float64") + + kernel_func(x=x_array, r=reduction_result) + + reduction_result[0] +``` + +### GPU Platforms + +Please note that **reductions are currently only supported for CUDA platforms**. 
+Similar to the CPU section, a base variant for NVIDIA GPUs without +explicitly employing any optimizations is shown: + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + kernel_gpu = ps.create_kernel(assign_sum, cfg) + + ps.inspect(kernel_gpu) +``` + +The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer +now require device memory, i.e. instances of {any}`cupy.ndarray`. + +## Optimizations for Reductions + +Going beyond the aforementioned basic kernel configurations, +we now demonstrate optimization strategies for different platforms +that can be applied to reduction kernels and show what impact they have. + +### CPU Platforms + +For CPU platforms, standard optimizations are employing SIMD vectorization and shared-memory parallelism using OpenMP. +The supported SIMD instruction sets for reductions are: +* SSE3 +* AVX/AVX2 +* AVX512 + +Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`. +**Note that reductions require `assume_inner_stride_one` to be enabled.** +This is due to the fact that other inner strides would require masked SIMD operations +which are not supported yet. + +```{code-cell} ipython3 +# configure SIMD vectorization +cfg = ps.CreateKernelConfig( + target=ps.Target.X86_AVX, +) +cfg.cpu.vectorize.enable = True +cfg.cpu.vectorize.assume_inner_stride_one = True + +# configure OpenMP parallelization +cfg.cpu.openmp.enable = True +cfg.cpu.openmp.num_threads = 8 + +kernel_cpu_opt = ps.create_kernel(assign_sum, cfg) + +ps.inspect(kernel_cpu_opt) +``` + +### GPU Platforms + +As evident from the generated kernel for the base variant, atomic operations are employed +for updating the pointer holding the reduction result. +Using the *explicit warp-level instructions* provided by CUDA allows us to achieve higher performance compared to +only using atomic operations. 
+To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by +the hardware's warp size. +**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this +criterion by enabling `assume_warp_aligned_block_size`.** +While the default block sizes provided by the code generator already fulfill this criterion, +we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space. + +You can find more detailed information about warp size alignment in {ref}`gpu_codegen`. + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + cfg.gpu.assume_warp_aligned_block_size = True + + kernel_gpu_opt = ps.create_kernel(assign_sum, cfg) + + kernel_func = kernel_gpu_opt.compile() + kernel_func.launch_config.fit_block_size((32, 1, 1)) + + ps.inspect(kernel_gpu_opt) +``` diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 07283d529..329f61d32 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -1,10 +1,6 @@ """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions""" -from .codegen import ( - Target, - CreateKernelConfig, - AUTO -) +from .codegen import Target, CreateKernelConfig, AUTO from .defaults import DEFAULTS from . import fd from . 
import stencil as stencil @@ -34,6 +30,13 @@ from .simp import AssignmentCollection from .sympyextensions.typed_sympy import TypedSymbol, DynamicType from .sympyextensions import SymbolCreator, tcast from .datahandling import create_data_handling +from .sympyextensions.reduction import ( + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, +) __all__ = [ "Field", @@ -61,6 +64,11 @@ __all__ = [ "AssignmentCollection", "Assignment", "AddAugmentedAssignment", + "AddReductionAssignment", + "SubReductionAssignment", + "MulReductionAssignment", + "MinReductionAssignment", + "MaxReductionAssignment", "assignment_from_stencil", "SymbolCreator", "create_data_handling", @@ -81,4 +89,5 @@ __all__ = [ ] from . import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py index edeba04f2..7032690a0 100644 --- a/src/pystencils/backend/ast/analysis.py +++ b/src/pystencils/backend/ast/analysis.py @@ -62,7 +62,7 @@ class UndefinedSymbolsCollector: case PsAssignment(lhs, rhs): undefined_vars = self(lhs) | self(rhs) - if isinstance(lhs, PsSymbolExpr): + if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr): undefined_vars.remove(lhs.symbol) return undefined_vars diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 705d25094..d7ae8d6a9 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -3,8 +3,9 @@ from __future__ import annotations from typing import cast from .astnode import PsAstNode -from .expressions import PsExpression, PsLvalue, PsUnOp +from .expressions import PsExpression, PsLvalue, PsUnOp, PsBinOp from .util import failing_cast +from ...sympyextensions import ReductionOp from ...types import PsVectorType @@ -17,7 +18,7 @@ class PsVecBroadcast(PsUnOp, 
PsVectorOp): """Broadcast a scalar value to N vector lanes.""" __match_args__ = ("lanes", "operand") - + def __init__(self, lanes: int, operand: PsExpression): super().__init__(operand) self._lanes = lanes @@ -25,26 +26,83 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): @property def lanes(self) -> int: return self._lanes - + @lanes.setter def lanes(self, n: int): self._lanes = n def _clone_expr(self) -> PsVecBroadcast: return PsVecBroadcast(self._lanes, self._operand.clone()) - + def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecBroadcast): return False + return super().structurally_equal(other) and self._lanes == other._lanes + + +class PsVecHorizontal(PsBinOp, PsVectorOp): + """Perform a horizontal reduction across a vector onto a scalar base value. + + **Example:** vec_horizontal_add(s, v)` will compute `s + v[0] + v[1] + ... + v[n-1]`. + + Args: + scalar_operand: Scalar operand + vector_operand: Vector operand to be converted to a scalar value + reduction_op: Binary operation that is also used for the horizontal reduction + """ + + __match_args__ = ("scalar_operand", "vector_operand", "reduction_op") + + def __init__( + self, + scalar_operand: PsExpression, + vector_operand: PsExpression, + reduction_op: ReductionOp, + ): + super().__init__(scalar_operand, vector_operand) + self._reduction_op = reduction_op + + @property + def scalar_operand(self) -> PsExpression: + return self._op1 + + @scalar_operand.setter + def scalar_operand(self, op: PsExpression): + self._op1 = op + + @property + def vector_operand(self) -> PsExpression: + return self._op2 + + @vector_operand.setter + def vector_operand(self, op: PsExpression): + self._op2 = op + + @property + def reduction_op(self) -> ReductionOp: + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op: ReductionOp): + self._reduction_op = op + + def _clone_expr(self) -> PsVecHorizontal: + return PsVecHorizontal( + self._op1.clone(), self._op2.clone(), 
self._reduction_op + ) + + def structurally_equal(self, other: PsAstNode) -> bool: + if not isinstance(other, PsVecHorizontal): + return False return ( super().structurally_equal(other) - and self._lanes == other._lanes + and self._reduction_op == other._reduction_op ) class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): """Pointer-based vectorized memory access. - + Args: base_ptr: Pointer identifying the accessed memory region offset: Offset inside the memory region @@ -95,7 +153,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): @property def stride(self) -> PsExpression | None: return self._stride - + @stride.setter def stride(self, expr: PsExpression | None): self._stride = expr @@ -106,10 +164,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): def get_vector_type(self) -> PsVectorType: return cast(PsVectorType, self._dtype) - + def get_children(self) -> tuple[PsAstNode, ...]: - return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,)) - + return (self._ptr, self._offset) + ( + () if self._stride is None else (self._stride,) + ) + def set_child(self, idx: int, c: PsAstNode): idx = [0, 1, 2][idx] match idx: @@ -138,7 +198,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): and self._vector_entries == other._vector_entries and self._aligned == other._aligned ) - + def __repr__(self) -> str: return ( f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, " diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index ffb65181c..5a3836d50 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -10,7 +10,7 @@ from .base_printer import BasePrinter, Ops, LR from ..ast import PsAstNode from ..ast.expressions import PsBufferAcc -from ..ast.vector import PsVecMemAcc, PsVecBroadcast +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal if TYPE_CHECKING: from 
...codegen import Kernel @@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel): class IRAstPrinter(BasePrinter): """Print the IR AST as pseudo-code. - + This printer produces a complete pseudocode representation of a pystencils AST. Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for each node defined in `ast <pystencils.backend.ast>`. @@ -77,6 +77,17 @@ class IRAstPrinter(BasePrinter): f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest ) + case PsVecHorizontal(scalar_operand, vector_operand, reduction_op): + pc.push_op(Ops.Weakest, LR.Middle) + scalar_operand_code = self.visit(scalar_operand, pc) + vector_operand_code = self.visit(vector_operand, pc) + pc.pop_op() + + return pc.parenthesize( + f"vec_horizontal_{reduction_op.name.lower()}({scalar_operand_code, vector_operand_code})", + Ops.Weakest, + ) + case _: return super().visit(node, pc) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index e815cbf99..a3da9a1de 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -3,9 +3,9 @@ from typing import Any, Sequence, TYPE_CHECKING from abc import ABC from enum import Enum -from ..types import PsType, PsNumericType +from ..sympyextensions import ReductionOp +from ..types import PsType, PsNumericType, PsTypeError from .exceptions import PsInternalCompilerError -from ..types import PsTypeError if TYPE_CHECKING: from .ast.expressions import PsExpression @@ -70,7 +70,7 @@ class MathFunctions(Enum): class PsMathFunction(PsFunction): - """Homogenously typed mathematical functions.""" + """Homogeneously typed mathematical functions.""" __match_args__ = ("func",) @@ -95,6 +95,33 @@ class PsMathFunction(PsFunction): return hash(self._func) +class PsReductionWriteBack(PsFunction): + """Function representing a reduction kernel's write-back step supported by the backend. + + Each platform has to materialize this function to a concrete implementation. 
+ """ + + def __init__(self, reduction_op: ReductionOp) -> None: + super().__init__("WriteBackToPtr", 2) + self._reduction_op = reduction_op + + @property + def reduction_op(self) -> ReductionOp: + return self._reduction_op + + def __str__(self) -> str: + return f"{super().name}" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, PsReductionWriteBack): + return False + + return self._reduction_op == other._reduction_op + + def __hash__(self) -> int: + return hash(self._reduction_op) + + class ConstantFunctions(Enum): """Numerical constant functions. @@ -122,7 +149,7 @@ class PsConstantFunction(PsFunction): and will be broadcast by the vectorizer. """ - __match_args__ = ("func,") + __match_args__ = ("func",) def __init__( self, func: ConstantFunctions, dtype: PsNumericType | None = None diff --git a/src/pystencils/backend/kernelcreation/analysis.py b/src/pystencils/backend/kernelcreation/analysis.py index 1365e1ef3..e5f8b921e 100644 --- a/src/pystencils/backend/kernelcreation/analysis.py +++ b/src/pystencils/backend/kernelcreation/analysis.py @@ -13,6 +13,8 @@ from ...simp import AssignmentCollection from sympy.codegen.ast import AssignmentBase from ..exceptions import PsInternalCompilerError, KernelConstraintsError +from ...sympyextensions.reduction import ReductionAssignment +from ...sympyextensions.typed_sympy import TypedSymbol class KernelAnalysis: @@ -54,6 +56,8 @@ class KernelAnalysis: self._check_access_independence = check_access_independence self._check_double_writes = check_double_writes + self._reduction_symbols: set[TypedSymbol] = set() + # Map pairs of fields and indices to offsets self._field_writes: dict[KernelAnalysis.FieldAndIndex, set[Any]] = defaultdict( set @@ -88,6 +92,14 @@ class KernelAnalysis: for asm in asms: self._visit(asm) + case ReductionAssignment(): + assert isinstance(obj.lhs, TypedSymbol) + + self._reduction_symbols.add(obj.lhs) + + self._handle_rhs(obj.rhs) + self._handle_lhs(obj.lhs) + case AssignmentBase(): 
self._handle_rhs(obj.rhs) self._handle_lhs(obj.lhs) @@ -152,6 +164,11 @@ class KernelAnalysis: f"{field} is read at {offsets} and written at {write_offset}" ) case sp.Symbol(): + if expr in self._reduction_symbols: + raise KernelConstraintsError( + f"Illegal access to reduction symbol {expr.name} outside of ReductionAssignment. " + ) + self._scopes.access_symbol(expr) for arg in expr.args: diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 68da893ff..319d6061c 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -1,12 +1,16 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Iterable, Iterator, Any from itertools import chain, count from collections import namedtuple, defaultdict import re +from ..ast.expressions import PsExpression, PsConstantExpr, PsCall +from ..functions import PsConstantFunction, ConstantFunctions from ...defaults import DEFAULTS from ...field import Field, FieldType +from ...sympyextensions import ReductionOp from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType from ..memory import PsSymbol, PsBuffer @@ -44,6 +48,26 @@ class FieldsInKernel: FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array")) +@dataclass(frozen=True) +class ReductionInfo: + """Information about a reduction operation, its neutral element in form of an initial value + and the pointer used by the kernel as write-back argument. 
+ + Attributes: + =========== + + reduction_op : Reduction operation being performed + init_val : Initial value used to initialize local symbol + local_symbol : Kernel-local symbol used to accumulate intermediate reduction result + writeback_ptr_symbol : Symbol that is used to export the final reduction result + """ + + op: ReductionOp + init_val: PsExpression + local_symbol: PsSymbol + writeback_ptr_symbol: PsSymbol + + class KernelCreationContext: """Manages the translation process from the SymPy frontend to the backend AST, and collects all necessary information for the translation: @@ -75,6 +99,8 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) + self._reduction_data: dict[str, ReductionInfo] = dict() + self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -93,7 +119,7 @@ class KernelCreationContext: def index_dtype(self) -> PsIntegerType: """Data type used by default for index expressions""" return self._index_dtype - + def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType: """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are.""" match dtype: @@ -178,6 +204,61 @@ class KernelCreationContext: self._symbols[old.name] = new + def add_reduction_info( + self, + lhs_name: str, + lhs_dtype: PsNumericType, + reduction_op: ReductionOp, + ): + """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name.""" + + # make sure that lhs symbol never occurred before ReductionAssignment + if self.find_symbol(lhs_name): + raise KernelConstraintsError( + f"Cannot create reduction with symbol {lhs_name}: " + "Another symbol with the same name already exist." 
+ ) + + # add symbol for lhs with pointer datatype for write-back mechanism + pointer_symb = self.get_symbol(lhs_name, PsPointerType(lhs_dtype)) + + # create kernel-local copy of lhs symbol + local_symb = self.get_new_symbol(f"{lhs_name}_local", lhs_dtype) + + # match for reduction operation and set neutral init_val + init_val: PsExpression + match reduction_op: + case ReductionOp.Add: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Sub: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Mul: + init_val = PsConstantExpr(PsConstant(1)) + case ReductionOp.Min: + init_val = PsCall(PsConstantFunction(ConstantFunctions.PosInfinity), []) + case ReductionOp.Max: + init_val = PsCall(PsConstantFunction(ConstantFunctions.NegInfinity), []) + case _: + raise PsInternalCompilerError( + f"Unsupported kind of reduction assignment: {reduction_op}." + ) + + # create reduction info and add to set + reduction_info = ReductionInfo( + reduction_op, init_val, local_symb, pointer_symb + ) + self._reduction_data[lhs_name] = reduction_info + + return reduction_info + + def find_reduction_info(self, name: str) -> ReductionInfo | None: + """Find a ReductionInfo with the given name in the lookup table, if it exists. + + Returns: + The ReductionInfo with the given name, or `None` if it does not exist. 
+ """ + return self._reduction_data.get(name, None) + def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None ) -> PsSymbol: @@ -213,6 +294,11 @@ class KernelCreationContext: """Return an iterable of all symbols listed in the symbol table.""" return self._symbols.values() + @property + def reduction_data(self) -> dict[str, ReductionInfo]: + """Return a dictionary holding kernel-local reduction information for given symbol names.""" + return self._reduction_data + # Fields and Arrays @property diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 0737d4c74..0fb1bd986 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,9 +1,8 @@ from typing import overload, cast, Any from functools import reduce -from operator import add, mul, sub, truediv +from operator import add, mul, sub import sympy as sp -import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment @@ -13,8 +12,10 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) +from ..reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc +from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...sympyextensions.bit_masks import bit_conditional from ...field import Field, FieldType @@ -178,19 +179,46 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - match expr.op: - case "+=": - op = add - case "-=": - op = sub - case "*=": - op = mul - case "/=": - op = truediv - case _: - raise FreezeError(f"Unsupported augmented assignment: {expr.op}.") + # transform augmented assignment to reduction op + str_to_reduction_op: dict[str, ReductionOp] = { + "+=": ReductionOp.Add, + "-=": 
ReductionOp.Sub, + "*=": ReductionOp.Mul, + "/=": ReductionOp.Div, + } + + # reuse existing handling for transforming reduction ops to expressions + return PsAssignment( + lhs, reduction_op_to_expr(str_to_reduction_op[expr.op], lhs.clone(), rhs) + ) - return PsAssignment(lhs, op(lhs.clone(), rhs)) + def map_ReductionAssignment(self, expr: ReductionAssignment): + assert isinstance(expr.lhs, TypedSymbol) + + rhs = self.visit(expr.rhs) + + assert isinstance(rhs, PsExpression) + + reduction_op = expr.reduction_op + lhs_symbol = expr.lhs + lhs_dtype = lhs_symbol.dtype + lhs_name = lhs_symbol.name + + if not isinstance(lhs_dtype, PsNumericType): + raise FreezeError("Reduction symbol must have a numeric data type.") + + # get reduction info from context + reduction_info = self._ctx.add_reduction_info( + lhs_name, lhs_dtype, reduction_op + ) + + # create new lhs from newly created local lhs symbol + new_lhs = PsSymbolExpr(reduction_info.local_symbol) + + # get new rhs from augmented assignment + new_rhs: PsExpression = reduction_op_to_expr(reduction_op, new_lhs, rhs) + + return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) @@ -563,7 +591,7 @@ class FreezeExpressions: def map_Not(self, neg: sympy.logic.Not) -> PsNot: arg = self.visit_expr(neg.args[0]) return PsNot(arg) - + def map_bit_conditional(self, conditional: bit_conditional): args = [self.visit_expr(arg) for arg in conditional.args] bitpos, mask, then_expr = args[:3] diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index e7a19bb50..c966262ea 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -49,7 +49,7 @@ from ..ast.expressions import ( PsNeg, PsNot, ) -from ..ast.vector import PsVecBroadcast, PsVecMemAcc +from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal from 
..functions import PsMathFunction, CFunction, PsConstantFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError @@ -245,7 +245,7 @@ class TypeContext: f" Expression: {expr}" f" Type Context: {self._target_type}" ) - + case PsCast(cast_target, _) if cast_target is None: expr.target_type = self._target_type # endif @@ -604,6 +604,38 @@ class Typifier: else: tc.apply_dtype(PsBoolType(), expr) + case PsVecHorizontal(): + # bin op consisting of a scalar and a vector that is converted to a scalar + # -> whole expression should be treated as scalar + + self.visit_expr(expr.scalar_operand, tc) + + vector_op_tc = TypeContext() + self.visit_expr(expr.vector_operand, vector_op_tc) + + if tc.target_type is None or vector_op_tc.target_type is None: + raise TypificationError( + f"Unable to determine type of argument to vector horizontal: {expr}" + ) + + if not isinstance(tc.target_type, PsScalarType): + raise TypificationError( + f"Illegal type in scalar operand (op1) to vector horizontal: {tc.target_type}" + ) + + if not isinstance(vector_op_tc.target_type, PsVectorType): + raise TypificationError( + f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" + ) + + if vector_op_tc.target_type.scalar_type is not tc.target_type: + raise TypificationError( + f"Scalar type of vector operand {vector_op_tc.target_type} " + f"does not correspond to type of scalar operand {tc.target_type}" + ) + + tc.apply_dtype(tc.target_type, expr) + case PsBinOp(op1, op2): self.visit_expr(op1, tc) self.visit_expr(op2, tc) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 4e5c5f275..6e6488ee1 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,119 @@ from __future__ import annotations +import math + from .generic_gpu import GenericGpu +from ..ast import PsAstNode +from ..ast.expressions import ( + PsExpression, + 
PsLiteralExpr, + PsCall, + PsAnd, + PsConstantExpr, + PsSymbolExpr, +) +from ..ast.structural import ( + PsConditional, + PsStatement, + PsAssignment, + PsBlock, + PsStructuralNode, +) +from ..constants import PsConstant +from ..functions import CFunction +from ..literals import PsLiteral +from ..reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType +from ...types.quick import SInt, UInt class CudaPlatform(GenericGpu): - """Platform for the CUDA GPU taret.""" + """Platform for the CUDA GPU target.""" @property def required_headers(self) -> set[str]: return super().required_headers | {'"pystencils_runtime/cuda.cuh"'} + + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + raise NotImplementedError( + "atomic operations are only available for float32/64 datatypes" + ) + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end + match reduction_op: + case ReductionOp.Sub: + actual_reduction_op = ReductionOp.Add + case _: + actual_reduction_op = reduction_op + + # check if thread is valid for performing reduction + ispace = self._ctx.get_iteration_space() + is_valid_thread = self._get_condition_for_translation(ispace) + + cond: PsExpression + shuffles: tuple[PsAssignment, ...] 
+ if self._warp_size and self._assume_warp_aligned_block_size: + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall( + CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [ + full_mask, + symbol_expr, + PsConstantExpr(PsConstant(offset, SInt(32))), + ], + ) + + # set up shuffle instructions for warp-level reduction + num_shuffles = math.frexp(self._warp_size)[1] + shuffles = tuple( + PsAssignment( + symbol_expr, + reduction_op_to_expr( + actual_reduction_op, + symbol_expr, + gen_shuffle_instr(pow(2, i - 1)), + ), + ) + for i in reversed(range(1, num_shuffles)) + ) + + # find first thread in warp + first_thread_in_warp = self._first_thread_in_warp(ispace) + + # set condition to only execute atomic operation on first valid thread in warp + cond = ( + PsAnd(is_valid_thread, first_thread_in_warp) + if is_valid_thread + else first_thread_in_warp + ) + else: + # no optimization: only execute atomic add on valid thread + shuffles = () + cond = is_valid_thread + + # use atomic operation + func = CFunction( + f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void") + ) + func_args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return shuffles, PsConditional( + cond, PsBlock([PsStatement(PsCall(func, func_args))]) + ) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 20167ffd6..cbde419d4 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -2,16 +2,20 @@ from abc import ABC, abstractmethod from typing import Sequence import numpy as np -from pystencils.backend.ast.expressions import PsCall +from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr +from ..ast import PsAstNode from ..functions import ( CFunction, - PsMathFunction, MathFunctions, + PsMathFunction, PsConstantFunction, 
ConstantFunctions, + PsReductionWriteBack, ) -from ...types import PsIntegerType, PsIeeeFloatType +from ..reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform from ..exceptions import MaterializationError @@ -24,7 +28,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock +from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsStructuralNode from ..ast.expressions import ( PsSymbolExpr, PsExpression, @@ -60,11 +64,40 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, (PsMathFunction | PsConstantFunction)) + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, (PsReductionWriteBack | PsMathFunction | PsConstantFunction)) + + if isinstance(call_func, PsReductionWriteBack): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptr_expr.dtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + symbol_expr.dtype, PsScalarType + ) + + ptr_access = PsMemAcc( + ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + + # create binop and potentially select corresponding function for e.g. 
min or max + potential_call = reduction_op_to_expr(actual_op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + return self.select_function(potential_call) + + return potential_call - func = call.function.func dtype = call.get_dtype() + func = call_func.func arg_types = (dtype,) * call.function.arg_count expr: PsExpression | None = None diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 60019c4cb..876da4c91 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,14 +1,23 @@ from __future__ import annotations + +import operator from abc import ABC, abstractmethod +from functools import reduce import numpy as np -from ...types import constify, deconstify, PsIntegerType +from ..ast import PsAstNode +from ...sympyextensions.reduction import ReductionOp +from ...types import ( + constify, + deconstify, + PsIntegerType, +) +from ...types.quick import SInt from ..exceptions import MaterializationError from .platform import Platform from ..memory import PsSymbol from ..kernelcreation import ( - KernelCreationContext, Typifier, IterationSpace, FullIterationSpace, @@ -17,7 +26,13 @@ from ..kernelcreation import ( ) from ..constants import PsConstant -from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..kernelcreation.context import KernelCreationContext +from ..ast.structural import ( + PsBlock, + PsConditional, + PsDeclaration, + PsStructuralNode, +) from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -25,19 +40,24 @@ from ..ast.expressions import ( PsCall, PsLookup, PsBufferAcc, + PsConstantExpr, + PsAdd, + PsRem, + PsEq, ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral + from ..functions import ( - PsMathFunction, MathFunctions, CFunction, + PsReductionWriteBack, + 
PsMathFunction, PsConstantFunction, ConstantFunctions, ) - int32 = PsSignedIntegerType(width=32, const=False) BLOCK_IDX = [ @@ -179,23 +199,38 @@ class GenericGpu(Platform): thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ + @property + @abstractmethod + def required_headers(self) -> set[str]: + return set() + + @abstractmethod + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + pass + def __init__( self, ctx: KernelCreationContext, + assume_warp_aligned_block_size: bool, + warp_size: int | None, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + self._warp_size = warp_size + self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) self._typify = Typifier(ctx) - @property - def required_headers(self) -> set[str]: - return set() - def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace ) -> PsBlock: @@ -206,11 +241,70 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, (PsMathFunction | PsConstantFunction)) + @staticmethod + def _get_condition_for_translation(ispace: IterationSpace): + + if isinstance(ispace, FullIterationSpace): + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + + return condition + elif isinstance(ispace, SparseIterationSpace): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = 
PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + @staticmethod + def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: + """Returns thread indices multiplied with block dimension strides per dimension.""" + + return tuple( + idx + * PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) + ) + + def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression: + """Returns expression that determines whether a thread is the first within a warp.""" + + tids_per_dim = GenericGpu._thread_index_per_dim(ispace) + tid: PsExpression = tids_per_dim[0] + for t in tids_per_dim[1:]: + tid = PsAdd(tid, t) + + return PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) + + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, (PsReductionWriteBack | PsMathFunction | PsConstantFunction)) + + if isinstance(call_func, PsReductionWriteBack): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + return self.resolve_reduction(ptr_expr, symbol_expr, op) - func = call.function.func dtype = call.get_dtype() + func = call_func.func arg_types = (dtype,) * call.function.arg_count expr: PsExpression | None = None @@ -273,7 +367,7 @@ class GenericGpu(Platform): case ConstantFunctions.PosInfinity: expr = PsExpression.make(PsLiteral(f"PS_FP{dtype.width}_INFINITY", dtype)) - + case ConstantFunctions.NegInfinity: expr = PsExpression.make(PsLiteral(f"PS_FP{dtype.width}_NEG_INFINITY", dtype)) @@ -285,7 +379,7 @@ class GenericGpu(Platform): if isinstance(dtype, PsIntegerType): expr = self._select_integer_function(call) - if expr is not None: + if expr is not 
None: if expr.dtype is None: typify = Typifier(self._ctx) typify(expr) @@ -304,7 +398,6 @@ class GenericGpu(Platform): ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] - conds = [] dimensions = ispace.dimensions_in_loop_order() @@ -318,14 +411,9 @@ class GenericGpu(Platform): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - conds.append(PsLt(ctr_expr, dim.stop)) - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - - return ast + cond = self._get_condition_for_translation(ispace) + return PsBlock(indexing_decls + [PsConditional(cond, body)]) def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace @@ -355,8 +443,5 @@ class GenericGpu(Platform): ] body.statements = mappings + body.statements - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - - return ast + cond = self._get_condition_for_translation(ispace) + return PsBlock([sparse_idx_decl, PsConditional(cond, body)]) diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index 6c2c79798..b712ac699 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -1,11 +1,25 @@ from __future__ import annotations from .generic_gpu import GenericGpu +from ..ast import PsAstNode +from ..ast.expressions import PsExpression +from ..ast.structural import PsStructuralNode +from ..exceptions import MaterializationError +from ...sympyextensions import ReductionOp class HipPlatform(GenericGpu): - """Platform for the HIP GPU taret.""" + """Platform for the HIP GPU target.""" @property def required_headers(self) -> set[str]: return super().required_headers | {'"pystencils_runtime/hip.h"'} + + def resolve_reduction( + self, + ptr_expr: 
PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + + raise MaterializationError("Reductions are not yet supported in HIP backend.") diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index a3e0b6328..f66b15741 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod +from ..ast import PsAstNode from ...types import PsIntegerType -from ..ast.structural import PsBlock +from ..ast.structural import PsBlock, PsStructuralNode from ..ast.expressions import PsCall, PsExpression, PsTernary, PsGe, PsLe from ..functions import PsMathFunction, MathFunctions from ..constants import PsConstant @@ -38,7 +39,9 @@ class Platform(ABC): pass @abstractmethod - def select_function(self, call: PsCall) -> PsExpression: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. 
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 4f018a21b..dd043f06b 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -1,12 +1,13 @@ from __future__ import annotations +from ..ast import PsAstNode from ..functions import CFunction, PsMathFunction, MathFunctions from ..kernelcreation.iteration_space import ( IterationSpace, FullIterationSpace, SparseIterationSpace, ) -from ..ast.structural import PsDeclaration, PsBlock, PsConditional +from ..ast.structural import PsDeclaration, PsBlock, PsConditional, PsStructuralNode from ..ast.expressions import ( PsExpression, PsSymbolExpr, @@ -52,7 +53,9 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 3f0285f0c..16dbc14c4 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Sequence +from typing import Sequence, Tuple from enum import Enum from functools import cache @@ -17,8 +17,9 @@ from ..ast.expressions import ( PsCast, PsCall, ) -from ..ast.vector import PsVecMemAcc, PsVecBroadcast -from ...types import PsCustomType, PsVectorType, PsPointerType +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...sympyextensions import ReductionOp +from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant from ..exceptions import MaterializationError @@ -132,6 +133,8 @@ class X86VectorCpu(GenericVectorCpu): else: headers = 
{"<immintrin.h>"} + headers.update({'"pystencils_runtime/simd_horizontal_helpers.h"'}) + return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: @@ -160,7 +163,14 @@ class X86VectorCpu(GenericVectorCpu): ) -> PsExpression: match expr: case PsUnOp() | PsBinOp(): - func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype()) + vtype: PsType + if isinstance(expr, PsVecHorizontal): + # return type of expression itself is scalar, but input argument to intrinsic is a vector + vtype = expr.vector_operand.get_dtype() + else: + vtype = expr.get_dtype() + + func = _x86_op_intrin(self._vector_arch, expr, vtype) intrinsic = func(*operands) intrinsic.dtype = func.return_type return intrinsic @@ -339,6 +349,7 @@ def _x86_op_intrin( prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) rtype = atype = varch.intrin_type(vtype) + atypes: Tuple[PsType, ...] = () match op: case PsVecBroadcast(): @@ -346,6 +357,16 @@ def _x86_op_intrin( if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4: suffix += "x" atype = vtype.scalar_type + case PsVecHorizontal(): + # horizontal add instead of sub avoids double inversion of sign + actual_op = ( + ReductionOp.Add + if op.reduction_op == ReductionOp.Sub + else op.reduction_op + ) + opstr = f"horizontal_{actual_op.name.lower()}" + rtype = vtype.scalar_type + atypes = (vtype.scalar_type, vtype) case PsAdd(): opstr = "add" case PsSub(): @@ -392,7 +413,9 @@ def _x86_op_intrin( case (SInt(64), Fp()) | ( Fp(), SInt(64), - ) if varch < X86VectorArch.AVX512: + ) if ( + varch < X86VectorArch.AVX512 + ): panic() # AVX512 only: cvtepiA_epiT if A > T case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512: @@ -408,4 +431,7 @@ def _x86_op_intrin( ) num_args = 1 if isinstance(op, PsUnOp) else 2 - return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) + if not atypes: + atypes = (atype,) * num_args + + return 
CFunction(f"{prefix}_{opstr}_{suffix}", atypes, rtype) diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py new file mode 100644 index 000000000..a97a496e0 --- /dev/null +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -0,0 +1,24 @@ +from .ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv +from .exceptions import PsInternalCompilerError +from .functions import PsMathFunction, MathFunctions +from ..sympyextensions.reduction import ReductionOp + + +def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: + match op: + case ReductionOp.Add: + return PsAdd(op1, op2) + case ReductionOp.Sub: + return PsSub(op1, op2) + case ReductionOp.Mul: + return PsMul(op1, op2) + case ReductionOp.Div: + return PsDiv(op1, op2) + case ReductionOp.Min: + return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2]) + case ReductionOp.Max: + return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) + case _: + raise PsInternalCompilerError( + f"Found unsupported operation type for reduction assignments: {op}." 
+ ) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index bd782422f..434903da0 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -8,7 +8,9 @@ from ..kernelcreation import KernelCreationContext from ..ast import PsAstNode from ..ast.structural import PsBlock, PsLoop, PsPragma, PsStructuralNode from ..ast.expressions import PsExpression +from ..kernelcreation.context import ReductionInfo +from ...types import PsScalarType __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"] @@ -103,12 +105,16 @@ class AddOpenMP: def __init__( self, ctx: KernelCreationContext, + reductions: Sequence[ReductionInfo] = (), nesting_depth: int = 0, num_threads: int | None = None, schedule: str | None = None, collapse: int | None = None, omit_parallel: bool = False, ) -> None: + + self._reductions = reductions + pragma_text = "omp" if not omit_parallel: @@ -122,6 +128,16 @@ class AddOpenMP: if num_threads is not None: pragma_text += f" num_threads({str(num_threads)})" + for reduction_info in self._reductions: + if isinstance(reduction_info.local_symbol.dtype, PsScalarType): + pragma_text += ( + f" reduction({reduction_info.op.value}: {reduction_info.local_symbol.name})" + ) + else: + raise NotImplementedError( + "OMP: Reductions for non-scalar data types are not supported yet. 
+ ) + if collapse is not None: if collapse <= 0: raise ValueError( diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index e1e4fea50..b029b95a7 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -1,15 +1,22 @@ import numpy as np from enum import Enum, auto -from typing import cast, Callable, overload +from typing import cast, Callable, overload, Sequence +from ..kernelcreation.context import ReductionInfo from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression, PsTernary, PsGt -from ..ast.vector import PsVecBroadcast +from ..ast.structural import ( + PsLoop, + PsBlock, + PsDeclaration, + PsAssignment, + PsStructuralNode, +) +from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr +from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols from .ast_vectorizer import VectorizationAxis, VectorizationContext, AstVectorizer @@ -48,10 +55,12 @@ class LoopVectorizer: self, ctx: KernelCreationContext, lanes: int, + reductions: Sequence[ReductionInfo] = (), trailing_iters: TrailingItersTreatment = TrailingItersTreatment.SCALAR_LOOP, ): self._ctx = ctx self._lanes = lanes + self._reductions = reductions self._trailing_iters = trailing_iters from ..kernelcreation import Typifier @@ -134,6 +143,45 @@ class LoopVectorizer: # Prepare vectorization context vc = VectorizationContext(self._ctx, self._lanes, axis) + # Prepare reductions found in loop body + simd_init_local_reduction_vars: list[PsStructuralNode] = [] + simd_writeback_local_reduction_vars: list[PsStructuralNode] = [] + for stmt in loop.body.statements: + for reduction_info in 
self._reductions: + match stmt: + case PsAssignment(lhs, _) if ( + isinstance(lhs, PsSymbolExpr) + and lhs.symbol is reduction_info.local_symbol + ): + # Vectorize symbol for local copy + local_symbol = lhs.symbol + vector_symb = vc.vectorize_symbol(local_symbol) + + # Declare and init vector + simd_init_local_reduction_vars += [ + self._type_fold( + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast( + self._lanes, + reduction_info.init_val.clone(), + ), + ) + ) + ] + + # Write back vectorization result + simd_writeback_local_reduction_vars += [ + PsAssignment( + PsSymbolExpr(local_symbol), + PsVecHorizontal( + PsSymbolExpr(local_symbol), + PsSymbolExpr(vector_symb), + reduction_info.op, + ), + ) + ] + # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -224,10 +272,10 @@ class LoopVectorizer: ) return PsBlock( - [ - simd_stop_decl, - simd_step_decl, - simd_loop, + simd_init_local_reduction_vars + + [simd_stop_decl, simd_step_decl, simd_loop] + + simd_writeback_local_reduction_vars + + [ trailing_start_decl, trailing_loop, ] @@ -238,11 +286,13 @@ class LoopVectorizer: case LoopVectorizer.TrailingItersTreatment.NONE: return PsBlock( - [ + simd_init_local_reduction_vars + + [ simd_stop_decl, simd_step_decl, simd_loop, ] + + simd_writeback_local_reduction_vars ) @overload diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index 1580f9d39..5953bd47d 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,7 +1,9 @@ +from ..ast.structural import PsAssignment, PsBlock, PsStructuralNode +from ..exceptions import MaterializationError from ..platforms import Platform from ..ast import PsAstNode -from ..ast.expressions import PsCall -from ..functions import PsMathFunction, PsConstantFunction +from ..ast.expressions import PsCall, PsExpression +from ..functions import 
PsMathFunction, PsConstantFunction, PsReductionWriteBack class SelectFunctions: @@ -17,9 +19,41 @@ class SelectFunctions: def visit(self, node: PsAstNode) -> PsAstNode: node.children = [self.visit(c) for c in node.children] - if isinstance(node, PsCall) and isinstance( - node.function, (PsMathFunction | PsConstantFunction) + if isinstance(node, PsAssignment): + rhs = node.rhs + if isinstance(rhs, PsCall) and isinstance( + rhs.function, PsReductionWriteBack + ): + resolved_func = self._platform.select_function(rhs) + + match resolved_func: + case (prepend, new_rhs): + assert isinstance(prepend, tuple) + + match new_rhs: + case PsExpression(): + return PsBlock( + prepend + (PsAssignment(node.lhs, new_rhs),) + ) + case PsStructuralNode(): + # special case: produces structural with atomic operation writing value back to ptr + return PsBlock(prepend + (new_rhs,)) + case _: + assert False, "Unexpected output from SelectFunctions." + case PsExpression(): + return PsAssignment(node.lhs, resolved_func) + case _: + raise MaterializationError( + f"Wrong return type for resolved function {rhs.function.name} in SelectFunctions." 
+ ) + else: + return node + elif isinstance(node, PsCall) and isinstance( + node.function, (PsMathFunction | PsConstantFunction) ): - return self._platform.select_function(node) + resolved_func = self._platform.select_function(node) + assert isinstance(resolved_func, PsExpression) + + return resolved_func else: return node diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 060192810..b20614393 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -7,7 +7,7 @@ from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral from ...types import PsCustomType, PsVectorType, constify, deconstify from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp -from ..ast.vector import PsVecMemAcc +from ..ast.vector import PsVecMemAcc, PsVecHorizontal from ..exceptions import MaterializationError from ..functions import CFunction, PsMathFunction @@ -86,6 +86,10 @@ class SelectIntrinsics: new_rhs = self.visit_expr(rhs, sc) return PsStatement(self._platform.vector_store(lhs, new_rhs)) + case PsAssignment(lhs, rhs) if isinstance(rhs, PsVecHorizontal): + new_rhs = self.visit_expr(rhs, sc) + return PsAssignment(lhs, new_rhs) + case _: node.children = [self.visit(c, sc) for c in node.children] @@ -93,7 +97,15 @@ class SelectIntrinsics: def visit_expr(self, expr: PsExpression, sc: SelectionContext) -> PsExpression: if not isinstance(expr.dtype, PsVectorType): - return expr + # special case: result type of horizontal reduction is scalar + if isinstance(expr, PsVecHorizontal): + scalar_op = expr.scalar_operand + vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) + return self._platform.op_intrinsic( + expr, [scalar_op, vector_op_to_scalar] + ) + else: + return expr match expr: case 
PsSymbolExpr(symb): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index e9fc69b76..d9531b33c 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -19,14 +19,22 @@ from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter from .functions import Lambda from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration +from ..backend.kernelcreation.context import ReductionInfo from ..field import Field from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop -from ..backend.ast.expressions import PsExpression +from ..backend.functions import PsReductionWriteBack +from ..backend.ast.expressions import ( + PsExpression, + PsSymbolExpr, + PsCall, + PsMemAcc, + PsConstantExpr, +) +from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment, PsStructuralNode from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -183,6 +191,10 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() + # Extensions for reductions + for _, reduction_info in self._ctx.reduction_data.items(): + self._modify_kernel_ast_for_reductions(reduction_info, kernel_ast) + # Target-Specific optimizations if self._target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) @@ -283,6 +295,31 @@ class DefaultKernelCreationDriver: return kernel_body + def _modify_kernel_ast_for_reductions(self, + reduction_info: ReductionInfo, + kernel_ast: PsBlock): + # typify local symbol and write-back pointer expressions and initial value + typify = Typifier(self._ctx) + symbol_expr = typify(PsSymbolExpr(reduction_info.local_symbol)) + ptr_symbol_expr = 
typify(PsSymbolExpr(reduction_info.writeback_ptr_symbol)) + init_val = typify(reduction_info.init_val) + + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionWriteBack(reduction_info.op), + [ptr_symbol_expr, symbol_expr], + ) + + # declare and init local copy with neutral element + prepend_ast: list[PsStructuralNode] = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast: list[PsStructuralNode] = [PsAssignment(ptr_access, write_back_ptr)] + + # modify AST + kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast + def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock: canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) @@ -321,6 +358,7 @@ class DefaultKernelCreationDriver: add_omp = AddOpenMP( self._ctx, + reductions=list(self._ctx.reduction_data.values()), nesting_depth=omp_options.get_option("nesting_depth"), num_threads=omp_options.get_option("num_threads"), schedule=omp_options.get_option("schedule"), @@ -381,7 +419,8 @@ class DefaultKernelCreationDriver: self._target, cast(PsScalarType, self._ctx.default_dtype) ) - vectorizer = LoopVectorizer(self._ctx, num_lanes) + vectorizer = LoopVectorizer(self._ctx, num_lanes, + list(self._ctx.reduction_data.values())) def loop_predicate(loop: PsLoop): return loop.counter.symbol == inner_loop_dim.counter @@ -405,14 +444,18 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: warp_size = 
GpuOptions.default_warp_size(self._target) if warp_size is None and assume_warp_aligned_block_size: - warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") + warn( + "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`." + ) return GpuIndexing( self._ctx, @@ -457,6 +500,11 @@ class DefaultKernelCreationDriver: else None ) + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) + warp_size: int | None = self._cfg.gpu.get_option("warp_size") + GpuPlatform: type match self._target: case Target.CUDA: @@ -468,6 +516,8 @@ class DefaultKernelCreationDriver: return GpuPlatform( self._ctx, + assume_warp_aligned_block_size, + warp_size, thread_mapping=thread_mapping, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 43b612bd7..f5606da02 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -260,6 +260,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, + rank: int, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool, @@ -270,7 +271,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + default_bs = GpuLaunchConfiguration.get_default_block_size(rank) self._default_block_size = default_bs self._init_block_size: dim3 = default_bs self._compute_block_size: ( @@ -598,6 +599,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( + rank, num_work_items, self._hw_props, self._assume_warp_aligned_block_size, diff --git a/src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h b/src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h new file mode 100644 index 000000000..6de5c3321 --- 
/dev/null +++ b/src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h @@ -0,0 +1,90 @@ +#pragma once + +// No direct implementation for all atomic operations available +// -> add support by custom implementations using a CAS mechanism + +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +#endif + +#ifdef __CUDA_ARCH__ + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax 
(double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/include/pystencils_runtime/cuda.cuh b/src/pystencils/include/pystencils_runtime/cuda.cuh index 6a22e0b90..691519a01 100644 --- a/src/pystencils/include/pystencils_runtime/cuda.cuh +++ b/src/pystencils/include/pystencils_runtime/cuda.cuh @@ -3,3 +3,4 @@ #include <cuda_fp16.h> #include "./bits/gpu_infinities.h" +#include "./bits/gpu_atomics.h" diff --git a/src/pystencils/include/pystencils_runtime/hip.h b/src/pystencils/include/pystencils_runtime/hip.h index 10084103a..32d5e84c6 100644 --- a/src/pystencils/include/pystencils_runtime/hip.h +++ b/src/pystencils/include/pystencils_runtime/hip.h @@ -3,6 +3,7 @@ #include <hip/hip_fp16.h> #include "./bits/gpu_infinities.h" +#include "./bits/gpu_atomics.h" #ifdef __HIPCC_RTC__ typedef __hip_uint8_t uint8_t; diff --git a/src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h b/src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h new file mode 100644 index 000000000..bd1889153 --- /dev/null +++ b/src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h @@ -0,0 +1,231 @@ +#pragma once + +#include <cmath> + +#if defined(__SSE3__) +#include <immintrin.h> + +inline 
double _mm_horizontal_add_pd(double dst, __m128d src) { + __m128d _v = src; + return dst + _mm_cvtsd_f64(_mm_hadd_pd(_v, _v)); +} + +inline float _mm_horizontal_add_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_hadd_ps(_v, _v); + return dst + _mm_cvtss_f32(_mm_add_ps(_h, _mm_movehdup_ps(_h))); +} + +inline double _mm_horizontal_mul_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_mul_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return dst * _r; +} + +inline float _mm_horizontal_mul_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_mul_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm_horizontal_min_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_min_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmin(_r, dst); +} + +inline float _mm_horizontal_min_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_min_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm_horizontal_max_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_max_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmax(_r, dst); +} + +inline float _mm_horizontal_max_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_max_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX__) +#include <immintrin.h> + +inline double _mm256_horizontal_add_pd(double dst, __m256d src) { + __m256d _v = src; + __m256d _h = _mm256_hadd_pd(_v, _v); + return dst + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); +} + +inline float _mm256_horizontal_add_ps(float dst, __m256 src) { + __m256 _v = src; + __m256 _h = _mm256_hadd_ps(_v, _v); + 
__m128 _i = _mm_add_ps(_mm256_extractf128_ps(_h,1), _mm256_castps256_ps128(_h)); + return dst + _mm_cvtss_f32(_mm_hadd_ps(_i,_i)); +} + +inline double _mm256_horizontal_mul_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_mul_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_mul_pd(_w, _mm_permute_pd(_w,1))); + return dst * _r; +} + +inline float _mm256_horizontal_mul_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_mul_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_mul_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm256_horizontal_min_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_min_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_min_pd(_w, _mm_permute_pd(_w,1))); + return fmin(_r, dst); +} + +inline float _mm256_horizontal_min_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_min_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_min_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm256_horizontal_max_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_max_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_max_pd(_w, _mm_permute_pd(_w,1))); + return fmax(_r, dst); +} + +inline float _mm256_horizontal_max_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_max_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_max_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX512F__) +#include <immintrin.h> + +inline double 
_mm512_horizontal_add_pd(double dst, __m512d src) { + double _r = _mm512_reduce_add_pd(src); + return dst + _r; +} + +inline float _mm512_horizontal_add_ps(float dst, __m512 src) { + float _r = _mm512_reduce_add_ps(src); + return dst + _r; +} + +inline double _mm512_horizontal_mul_pd(double dst, __m512d src) { + double _r = _mm512_reduce_mul_pd(src); + return dst * _r; +} + +inline float _mm512_horizontal_mul_ps(float dst, __m512 src) { + float _r = _mm512_reduce_mul_ps(src); + return dst * _r; +} + +inline double _mm512_horizontal_min_pd(double dst, __m512d src) { + double _r = _mm512_reduce_min_pd(src); + return fmin(_r, dst); +} + +inline float _mm512_horizontal_min_ps(float dst, __m512 src) { + float _r = _mm512_reduce_min_ps(src); + return fmin(_r, dst); +} + +inline double _mm512_horizontal_max_pd(double dst, __m512d src) { + double _r = _mm512_reduce_max_pd(src); + return fmax(_r, dst); +} + +inline float _mm512_horizontal_max_ps(float dst, __m512 src) { + float _r = _mm512_reduce_max_ps(src); + return fmax(_r, dst); +} + +#endif + +#if defined(_M_ARM64) +#include <arm_neon.h> + +inline double vgetq_horizontal_add_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r += vgetq_lane_f64(_v,1); + return dst + _r; +} + +inline float vget_horizontal_add_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vadd_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r += vget_lane_f32(_w,1); + return dst + _r; +} + +inline double vgetq_horizontal_mul_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r *= vgetq_lane_f64(_v,1); + return dst * _r; +} + +inline float vget_horizontal_mul_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmul_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r *= vget_lane_f32(_w,1); + return dst * _r; +} + +inline double 
vgetq_horizontal_min_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r = fmin(_r, vgetq_lane_f64(_v,1)); + return fmin(_r, dst); +} + +inline float vget_horizontal_min_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmin_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r = fmin(_r, vget_lane_f32(_w,1)); + return fmin(_r, dst); +} + +inline double vgetq_horizontal_max_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r = fmax(_r, vgetq_lane_f64(_v,1)); + return fmax(_r, dst); +} + +inline float vget_horizontal_max_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmax_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r = fmax(_r, vget_lane_f32(_w,1)); + return fmax(_r, dst); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index fca043db9..4d76ea9ca 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -92,6 +92,7 @@ class PsKernelExtensioNModule: # Kernels and call wrappers from ..backend.emission import CAstPrinter + printer = CAstPrinter(func_prefix="FUNC_PREFIX") for name, kernel in self._kernels.items(): @@ -200,13 +201,15 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ """ def __init__(self) -> None: - self._buffer_types: dict[Field, PsType] = dict() - self._array_extractions: dict[Field, str] = dict() - self._array_frees: dict[Field, str] = dict() + self._buffer_types: dict[Any, PsType] = dict() + self._array_extractions: dict[Any, str] = dict() + self._array_frees: dict[Any, str] = dict() self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() + self._pointer_extractions: dict[Parameter, str] = dict() + self._constraint_checks: list[str] = [] 
self._call: str | None = None @@ -232,38 +235,42 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ else: return None + def get_buffer(self, buffer_name: str) -> str: + """Get the Python buffer object for a given buffer name.""" + return f"buffer_{buffer_name}" + def get_field_buffer(self, field: Field) -> str: """Get the Python buffer object for the given field.""" - return f"buffer_{field.name}" + return self.get_buffer(field.name) - def extract_field(self, field: Field) -> None: - """Add the necessary code to extract the NumPy array for a given field""" - if field not in self._array_extractions: - extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name) - actual_dtype = self._buffer_types[field] + def extract_buffer(self, buffer: Any, name: str) -> None: + """Add the necessary code to extract the NumPy array for a given buffer""" + if buffer not in self._array_extractions: + extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=name) + actual_dtype = self._buffer_types[buffer] # Check array type type_char = self._type_char(actual_dtype) if type_char is not None: - dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'" + dtype_cond = f"buffer_{name}.format[0] == '{type_char}'" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( cond=dtype_cond, what="data type", - name=field.name, + name=name, expected=str(actual_dtype), ) # Check item size itemsize = actual_dtype.itemsize - item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}" + item_size_cond = f"buffer_{name}.itemsize == {itemsize}" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( - cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize + cond=item_size_cond, what="itemsize", name=name, expected=itemsize ) - self._array_extractions[field] = extraction_code + self._array_extractions[buffer] = extraction_code - release_code = f"PyBuffer_Release(&buffer_{field.name});" - self._array_frees[field] = release_code + release_code = f"PyBuffer_Release(&buffer_{name});" + 
self._array_frees[buffer] = release_code def extract_scalar(self, param: Parameter) -> str: if param not in self._scalar_extractions: @@ -277,6 +284,26 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name + def extract_ptr(self, param: Parameter) -> str: + if param not in self._pointer_extractions: + ptr = param.symbol + ptr_dtype = ptr.dtype + + assert isinstance(ptr_dtype, PsPointerType) + + self._buffer_types[ptr] = ptr_dtype.base_type + self.extract_buffer(ptr, param.name) + buffer = self.get_buffer(param.name) + code = ( + f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + ) + + assert code is not None + + self._array_assoc_var_extractions[param] = code + + return param.name + def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] @@ -321,11 +348,13 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ actual_field_type = field.dtype self._buffer_types[prop.field] = actual_field_type - self.extract_field(prop.field) + self.extract_buffer(prop.field, field.name) for param in params: if param.is_field_parameter: self.extract_array_assoc_var(param) + elif isinstance(param.dtype, PsPointerType): + self.extract_ptr(param) else: self.extract_scalar(param) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 7d2fe0991..893589a01 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -11,7 +11,6 @@ except ImportError: from ..codegen import Target from ..field import FieldType -from ..types import PsType from .jit import JitBase, JitError, KernelWrapper from ..codegen import ( Kernel, @@ -20,7 +19,7 @@ from ..codegen import ( ) from ..codegen.gpu_indexing import GpuLaunchConfiguration from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr -from ..types import PsStructType, PsPointerType +from ..types import PsType, PsStructType, PsPointerType from ..include import get_pystencils_include_path 
@@ -186,10 +185,13 @@ class CupyKernelWrapper(KernelWrapper): arr.strides[coord] // arr.dtype.itemsize, ) break + elif isinstance(kparam.dtype, PsPointerType): + val = kwargs[kparam.name] + kernel_args.append(val) else: # scalar parameter - val: Any = kwargs[param.name] - adder(param, val) + val = kwargs[kparam.name] + adder(kparam, val) # Process Arguments diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index f1ba87154..03b4edccf 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -1,5 +1,6 @@ import itertools from copy import copy + from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union import sympy as sp diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 3933fd9b3..8c7d10a80 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,6 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc, tcast, DynamicType from .pointers import mem_acc +from .reduction import reduction_assignment, ReductionOp from .bit_masks import bit_conditional from .math import ( @@ -28,12 +29,14 @@ from .math import ( count_operations_in_ast, common_denominator, get_symmetric_part, - SymbolCreator + SymbolCreator, ) __all__ = [ "ConditionalFieldAccess", + "reduction_assignment", + "ReductionOp", "TypedSymbol", "CastFunc", "tcast", diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py new file mode 100644 index 000000000..a1a9a026c --- /dev/null +++ b/src/pystencils/sympyextensions/reduction.py @@ -0,0 +1,82 @@ +from enum import Enum + +from sympy.codegen.ast import AssignmentBase + +from . 
import TypedSymbol + + +class ReductionOp(Enum): + Add = "+" + Sub = "-" + Mul = "*" + Div = "/" + Min = "min" + Max = "max" + + +class ReductionAssignment(AssignmentBase): + """ + Base class for reduced assignments. + + Attributes: + =========== + + reduction_op : ReductionOp + Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. + """ + + _reduction_op = None # type: ReductionOp + + @property + def reduction_op(self): + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op): + self._reduction_op = op + + @classmethod + def _check_args(cls, lhs, rhs): + super()._check_args(lhs, rhs) + + if not isinstance(lhs, TypedSymbol): + raise TypeError(f"lhs of needs to be a TypedSymbol. Got {type(lhs)} instead.") + + +class AddReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Add + + +class SubReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Sub + + +class MulReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Mul + + +class MinReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Min + + +class MaxReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Max + + +# Mapping from ReductionOp enum to ReductionAssigment classes +_reduction_assignment_classes = { + cls.reduction_op: cls + for cls in [ + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + ] +} + + +def reduction_assignment(lhs, op: ReductionOp, rhs): + if op not in _reduction_assignment_classes: + raise ValueError("Unrecognized operator %s" % op) + return _reduction_assignment_classes[op](lhs, rhs) diff --git a/tests/frontend/test_sympyextensions.py b/tests/frontend/test_sympyextensions.py index ad5d2513b..152527441 100644 --- a/tests/frontend/test_sympyextensions.py +++ b/tests/frontend/test_sympyextensions.py @@ -3,7 +3,9 @@ import numpy as np import sympy as sp import 
pystencils -from pystencils import Assignment +import pytest + +from pystencils import Assignment, TypedSymbol from pystencils.sympyextensions import replace_second_order_products from pystencils.sympyextensions import remove_higher_order_terms from pystencils.sympyextensions import complete_the_squares_in_exp @@ -27,6 +29,16 @@ from pystencils.sympyextensions.integer_functions import ( div_ceil, ) +from pystencils.sympyextensions.reduction import ( + ReductionOp, + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + reduction_assignment, +) + def test_utility(): a = [1, 2] @@ -199,6 +211,30 @@ def test_count_operations(): assert ops["muls"] == 99 +@pytest.mark.parametrize("reduction_assignment_for_op", [ + (ReductionOp.Add, AddReductionAssignment), + (ReductionOp.Sub, SubReductionAssignment), + (ReductionOp.Mul, MulReductionAssignment), + (ReductionOp.Min, MinReductionAssignment), + (ReductionOp.Max, MaxReductionAssignment), +]) +def test_reduction_assignments( + reduction_assignment_for_op +): + reduction_op, reduction_assignment_type = reduction_assignment_for_op + + w = TypedSymbol("w", "float64") + v = sympy.symbols("v") + + assignment = reduction_assignment(w, reduction_op, 0) + + assert isinstance(assignment, reduction_assignment_type) + + # invalid assignment since v is not a typed symbol + with pytest.raises(TypeError): + _ = reduction_assignment(v, reduction_op, 0) + + def test_common_denominator(): x = sympy.symbols("x") expr = sympy.Rational(1, 2) + x * sympy.Rational(2, 3) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py new file mode 100644 index 000000000..1fb8efc81 --- /dev/null +++ b/tests/kernelcreation/test_reduction.py @@ -0,0 +1,93 @@ +import pytest +import numpy as np + +import pystencils as ps +from pystencils.sympyextensions import ReductionOp, reduction_assignment + +INIT_W = 5 +INIT_ARR = 2 +SIZE = 15 +SOLUTION = { + 
ReductionOp.Add: INIT_W + INIT_ARR * SIZE, + ReductionOp.Sub: INIT_W - INIT_ARR * SIZE, + ReductionOp.Mul: INIT_W * INIT_ARR**SIZE, + ReductionOp.Min: min(INIT_W, INIT_ARR), + ReductionOp.Max: max(INIT_W, INIT_ARR), +} + + +# get AST for kernel with reduction assignment +def get_reduction_assign_ast(dtype, op, config): + x = ps.fields(f"x: {dtype}[1d]") + w = ps.TypedSymbol("w", dtype) + + red_assign = reduction_assignment(w, op, x.center()) + + return ps.create_kernel([red_assign], config, default_dtype=dtype) + + +@pytest.mark.parametrize("instruction_set", ["sse", "avx"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max]) +def test_reduction_cpu(instruction_set, dtype, op): + vectorize_info = { + "instruction_set": instruction_set, + "assume_inner_stride_one": True, + } + + config = ps.CreateKernelConfig( + target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info + ) + + ast_reduction = get_reduction_assign_ast(dtype, op, config) + ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() + + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) + + +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_reduction_gpu( + dtype: str, + op: str, + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, +): + try: + import cupy as cp + from cupy_backends.cuda.api.runtime import CUDARuntimeError + + device_count = range(cp.cuda.runtime.getDeviceCount()) + print(f"Found {device_count} GPUs") 
+ except ImportError: + pytest.skip(reason="CuPy is not available", allow_module_level=True) + except CUDARuntimeError: + pytest.skip( + reason="No CUDA capable device is detected", allow_module_level=True + ) + + cfg = ps.CreateKernelConfig(target=ps.Target.GPU) + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size + + ast_reduction = get_reduction_assign_ast(dtype, op, cfg) + ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() + + if use_block_fitting: + kernel_reduction.launch_config.fit_block_size((32, 1, 1)) + + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + array_gpu = cp.asarray(array) + reduction_array_gpu = cp.asarray(reduction_array) + + kernel_reduction(x=array_gpu, w=reduction_array_gpu) + assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) diff --git a/tests/nbackend/kernelcreation/test_analysis.py b/tests/nbackend/kernelcreation/test_analysis.py new file mode 100644 index 000000000..d68c0a5f3 --- /dev/null +++ b/tests/nbackend/kernelcreation/test_analysis.py @@ -0,0 +1,38 @@ +import pytest + +from pystencils import fields, TypedSymbol, AddReductionAssignment, Assignment, KernelConstraintsError +from pystencils.backend.kernelcreation import KernelCreationContext, KernelAnalysis +from pystencils.sympyextensions import mem_acc +from pystencils.types.quick import Ptr, Fp + + +def test_invalid_reduction_symbol_reassign(): + dtype = Fp(64) + ctx = KernelCreationContext(default_dtype=dtype) + analysis = KernelAnalysis(ctx) + + x = fields(f"x: [1d]") + w = TypedSymbol("w", dtype) + + # illegal reassign to already locally defined symbol (here: reduction symbol) + with pytest.raises(KernelConstraintsError): + analysis([ + AddReductionAssignment(w, 3 * x.center()), + Assignment(w, 0) + ]) + +def test_invalid_reduction_symbol_reference(): + dtype = Fp(64) + ctx = KernelCreationContext(default_dtype=dtype) + analysis = KernelAnalysis(ctx) + + x = 
fields(f"x: [1d]") + v = TypedSymbol("v", dtype) + w = TypedSymbol("w", dtype) + + # do not allow reduction symbol to be referenced on rhs of other assignments + with pytest.raises(KernelConstraintsError): + analysis([ + AddReductionAssignment(w, 3 * x.center()), + Assignment(v, w) + ]) \ No newline at end of file diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py index 5384c90f9..679a7cebc 100644 --- a/tests/nbackend/kernelcreation/test_freeze.py +++ b/tests/nbackend/kernelcreation/test_freeze.py @@ -8,6 +8,7 @@ from pystencils import ( create_numeric_type, TypedSymbol, DynamicType, + KernelConstraintsError, ) from pystencils.sympyextensions import tcast, bit_conditional from pystencils.sympyextensions.pointers import mem_acc @@ -44,6 +45,7 @@ from pystencils.backend.ast.expressions import ( PsArrayInitList, PsSubscript, PsMemAcc, + PsSymbolExpr, ) from pystencils.backend.constants import PsConstant from pystencils.backend.functions import PsMathFunction, MathFunctions, PsConstantFunction, ConstantFunctions @@ -66,6 +68,14 @@ from pystencils.sympyextensions.integer_functions import ( ceil_to_multiple, div_ceil, ) +from pystencils.sympyextensions.reduction import ( + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, +) +from pystencils.types import PsTypeError def test_freeze_simple(): @@ -519,6 +529,79 @@ def test_invalid_arrays(): _ = freeze(symb_arr) +@pytest.mark.parametrize("reduction_assignment_rhs_type", + [ + (AddReductionAssignment, PsAdd), + (SubReductionAssignment, PsSub), + (MulReductionAssignment, PsMul), + (MinReductionAssignment, PsCall), + (MaxReductionAssignment, PsCall), + ]) +def test_reduction_assignments( + reduction_assignment_rhs_type +): + x = fields(f"x: float64[1d]") + w = TypedSymbol("w", "float64") + + reduction_op, rhs_type = reduction_assignment_rhs_type + + ctx = KernelCreationContext() + freeze = 
FreezeExpressions(ctx) + + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + expr = freeze(reduction_op(w, 3 * x.center())) + + info = ctx.find_reduction_info(w.name) + + assert isinstance(expr, PsAssignment) + assert isinstance(expr.lhs, PsSymbolExpr) + + assert expr.lhs.symbol == info.local_symbol + assert expr.lhs.dtype == w.dtype + + assert isinstance(expr.rhs, rhs_type) + + +def test_invalid_reduction_assignments(): + x = fields(f"x: float64[1d]") + w = TypedSymbol("w", "float64") + + assignment = Assignment(w, -1 * x.center()) + reduction_assignment = AddReductionAssignment(w, 3 * x.center()) + + expected_errors_for_invalid_cases = [ + # 1) Reduction symbol is used before ReductionAssignment. + # May only be used for reductions -> KernelConstraintsError + ([assignment, reduction_assignment], KernelConstraintsError), + # 2) Reduction symbol is used after ReductionAssignment. 
+ # Reduction symbol is converted to pointer after freeze -> PsTypeError + ([reduction_assignment, assignment], PsTypeError), + # 3) Duplicate ReductionAssignment + # May only be used once for now -> KernelConstraintsError + ([reduction_assignment, reduction_assignment], KernelConstraintsError) + ] + + for invalid_assignment, error_class in expected_errors_for_invalid_cases: + ctx = KernelCreationContext() + freeze = FreezeExpressions(ctx) + + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + with pytest.raises(error_class): + _ = [freeze(asm) for asm in invalid_assignment] + + def test_memory_access(): ctx = KernelCreationContext() freeze = FreezeExpressions(ctx) diff --git a/tests/nbackend/kernelcreation/test_typification.py b/tests/nbackend/kernelcreation/test_typification.py index d5b204e9f..62b0106b9 100644 --- a/tests/nbackend/kernelcreation/test_typification.py +++ b/tests/nbackend/kernelcreation/test_typification.py @@ -5,6 +5,7 @@ import numpy as np from typing import cast from pystencils import Assignment, TypedSymbol, Field, FieldType, AddAugmentedAssignment +from pystencils.sympyextensions import ReductionOp from pystencils.sympyextensions.pointers import mem_acc from pystencils.backend.ast.structural import ( @@ -34,9 +35,9 @@ from pystencils.backend.ast.expressions import ( PsTernary, PsMemAcc ) +from pystencils.backend.ast.vector import PsVecBroadcast, PsVecHorizontal from pystencils.backend.ast import dfs_preorder from pystencils.backend.ast.expressions import PsAdd -from pystencils.backend.ast.vector import PsVecBroadcast from pystencils.backend.constants import PsConstant from pystencils.backend.functions import CFunction, PsConstantFunction, ConstantFunctions from pystencils.types import constify, create_type, create_numeric_type, PsVectorType, PsTypeError 
@@ -744,6 +745,50 @@ def test_propagate_constant_type_in_broadcast(): assert expr.dtype == PsVectorType(fp16, 4) +def test_typify_horizontal_vector_reductions(): + ctx = KernelCreationContext() + typify = Typifier(ctx) + + reduction_op = ReductionOp.Add + stype = Fp(32) + vtype = PsVectorType(stype, 4) + + def create_symb_expr(name, tpe): + return PsExpression.make(ctx.get_symbol(name, tpe)) + + # create valid horizontal and check if expression type is scalar + result = typify( + PsVecHorizontal( + create_symb_expr("s1", stype), create_symb_expr("v1", vtype), ReductionOp.Add + ) + ) + assert result.get_dtype() == stype + + # create invalid horizontal by using scalar type for expected vector type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s2", stype), create_symb_expr("v2", stype), reduction_op + ) + ) + + # create invalid horizontal by using vector type for expected scalar type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s3", vtype), create_symb_expr("v3", vtype), reduction_op + ) + ) + + # create invalid horizontal where base type of vector does not match with scalar type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s4", Int(32)), create_symb_expr("v4", vtype), reduction_op + ) + ) + + def test_inference_fails(): ctx = KernelCreationContext() typify = Typifier(ctx) diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py new file mode 100644 index 000000000..1d652c6e1 --- /dev/null +++ b/util/generate_simd_horizontal_op.py @@ -0,0 +1,309 @@ +from enum import Enum + +FCT_QUALIFIERS = "inline" + + +class InstructionSets(Enum): + SSE3 = "SSE3" + AVX = "AVX" + AVX512 = "AVX512" + NEON = "NEON" + + def __str__(self): + return self.value + + +class ReductionOps(Enum): + Add = ("add", "+") + Mul = ("mul", "*") + Min = ("min", "min") + Max = ("max", "max") + + def __init__(self, op_name, 
op_str): + self.op_name = op_name + self.op_str = op_str + + +class ScalarTypes(Enum): + Double = "double" + Float = "float" + + def __str__(self): + return self.value + + +class VectorTypes(Enum): + SSE3_128d = "__m128d" + SSE3_128 = "__m128" + + AVX_256d = "__m256d" + AVX_256 = "__m256" + AVX_128 = "__m128" + + AVX_512d = "__m512d" + AVX_512 = "__m512" + + NEON_64x2 = "float64x2_t" + NEON_32x4 = "float32x4_t" + + def __str__(self): + return self.value + + +class Variable: + def __init__(self, name: str, dtype: ScalarTypes | VectorTypes): + self._name = name + self._dtype = dtype + + def __str__(self): + return f"{self._dtype} {self._name}" + + @property + def name(self) -> str: + return self._name + + @property + def dtype(self) -> ScalarTypes | VectorTypes: + return self._dtype + + +def get_intrin_from_vector_type(vtype: VectorTypes) -> InstructionSets: + match vtype: + case VectorTypes.SSE3_128 | VectorTypes.SSE3_128d: + return InstructionSets.SSE3 + case VectorTypes.AVX_256 | VectorTypes.AVX_256d: + return InstructionSets.AVX + case VectorTypes.AVX_512 | VectorTypes.AVX_512d: + return InstructionSets.AVX512 + case VectorTypes.NEON_32x4 | VectorTypes.NEON_64x2: + return InstructionSets.NEON + + +def intrin_prefix(instruction_set: InstructionSets, double_prec: bool): + match instruction_set: + case InstructionSets.SSE3: + return "_mm" + case InstructionSets.AVX: + return "_mm256" + case InstructionSets.AVX512: + return "_mm512" + case InstructionSets.NEON: + return "vgetq" if double_prec else "vget" + case _: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def intrin_suffix(instruction_set: InstructionSets, double_prec: bool): + if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]: + return "pd" if double_prec else "ps" + elif instruction_set in [InstructionSets.NEON]: + return "f64" if double_prec else "f32" + else: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def 
generate_hadd_intrin(instruction_set: InstructionSets, double_prec: bool, v: str): + return f"{intrin_prefix(instruction_set, double_prec)}_hadd_{intrin_suffix(instruction_set, double_prec)}({v}, {v})" + + +def generate_shuffle_intrin(instruction_set: InstructionSets, double_prec: bool, v: str, offset): + return f"_mm_shuffle_{intrin_suffix(instruction_set, double_prec)}({v}, {v}, {offset})" + + +def generate_op_intrin(instruction_set: InstructionSets, double_prec: bool, reduction_op: ReductionOps, a: str, b: str): + return f"_mm_{reduction_op.op_name}_{intrin_suffix(instruction_set, double_prec)}({a}, {b})" + + +def generate_cvts_intrin(double_prec: bool, v: str): + convert_suffix = "f64" if double_prec else "f32" + intrin_suffix = "d" if double_prec else "s" + return f"_mm_cvts{intrin_suffix}_{convert_suffix}({v})" + + +def generate_fct_name(instruction_set: InstructionSets, double_prec: bool, op: ReductionOps): + prefix = intrin_prefix(instruction_set, double_prec) + suffix = intrin_suffix(instruction_set, double_prec) + return f"{prefix}_horizontal_{op.op_name}_{suffix}" + + +def generate_fct_decl(instruction_set: InstructionSets, op: ReductionOps, svar: Variable, vvar: Variable): + double_prec = svar.dtype is ScalarTypes.Double + return f"{FCT_QUALIFIERS} {svar.dtype} {generate_fct_name(instruction_set, double_prec, op)}({svar}, {vvar}) {{ \n" + + +# SSE & AVX provide horizontal add 'hadd' intrinsic that allows for specialized handling +def generate_simd_horizontal_add(scalar_var: Variable, vector_var: Variable): + reduction_op = ReductionOps.Add + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + sname = scalar_var.name + vtype = vector_var.dtype + vname = vector_var.name + + simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b) + hadd = lambda var: generate_hadd_intrin(instruction_set, double_prec, var) + cvts = lambda var: 
def generate_simd_horizontal_op(reduction_op: ReductionOps, scalar_var: Variable, vector_var: Variable):
    """Generate the C source of one inline horizontal-reduction function.

    The emitted function folds all lanes of `vector_var` with `reduction_op`
    and combines the result with `scalar_var`, returning a scalar. The
    instruction set is derived from the vector type; Add on SSE3/AVX is
    delegated to the specialized `generate_simd_horizontal_add`.
    """
    instruction_set = get_intrin_from_vector_type(vector_var.dtype)
    double_prec = scalar_var.dtype is ScalarTypes.Double

    # generate specialized version for add operation
    if reduction_op == ReductionOps.Add and instruction_set in [InstructionSets.SSE3, InstructionSets.AVX]:
        return generate_simd_horizontal_add(scalar_var, vector_var)

    sname = scalar_var.name
    stype = scalar_var.dtype
    vtype = vector_var.dtype
    vname = vector_var.name

    opname = reduction_op.op_name
    opstr = reduction_op.op_str

    # min/max fold via the C library fmin/fmax functions instead of an
    # infix operator; None means "use `opstr` as an infix/compound op".
    reduction_function = f"f{opname}" \
        if reduction_op in [ReductionOps.Max, ReductionOps.Min] else None

    # Shorthands binding the current instruction set / precision.
    simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b)
    cvts = lambda var: generate_cvts_intrin(double_prec, var)
    shuffle = lambda var, offset: generate_shuffle_intrin(instruction_set, double_prec, var, offset)

    # function body
    # AVX512 reduces straight from the source register, so no local copy.
    body = f"\t{vtype} _v = {vname};\n" if instruction_set != InstructionSets.AVX512 else ""
    match instruction_set:
        case InstructionSets.SSE3:
            # Pairwise shuffle/op butterfly; masks 1, 177 (0xB1) and
            # 10 (0x0A) are _mm_shuffle_* lane selectors — see the Intel
            # intrinsics guide for the exact permutations.
            if double_prec:
                body += f"\t{stype} _r = {cvts(simd_op('_v', shuffle('_v', 1)))};\n"
            else:
                body += f"\t{vtype} _h = {simd_op('_v', shuffle('_v', 177))};\n" \
                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"

        case InstructionSets.AVX:
            # Combine upper and lower 128-bit halves first, then finish
            # the reduction within one 128-bit register.
            if double_prec:
                body += f"\t__m128d _w = {simd_op('_mm256_extractf128_pd(_v,1)', '_mm256_castpd256_pd128(_v)')};\n" \
                        f"\t{stype} _r = {cvts(simd_op('_w', '_mm_permute_pd(_w,1)'))}; \n"
            else:
                body += f"\t__m128 _w = {simd_op('_mm256_extractf128_ps(_v,1)', '_mm256_castps256_ps128(_v)')};\n" \
                        f"\t__m128 _h = {simd_op('_w', shuffle('_w', 177))};\n" \
                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"

        case InstructionSets.AVX512:
            # AVX512 provides single-intrinsic whole-register reductions.
            suffix = intrin_suffix(instruction_set, double_prec)
            body += f"\t{stype} _r = _mm512_reduce_{opname}_{suffix}({vname});\n"

        case InstructionSets.NEON:
            if double_prec:
                # Two f64 lanes: extract both and fold them pairwise.
                body += f"\t{stype} _r = vgetq_lane_f64(_v,0);\n"
                if reduction_function:
                    body += f"\t_r = {reduction_function}(_r, vgetq_lane_f64(_v,1));\n"
                else:
                    body += f"\t_r {opstr}= vgetq_lane_f64(_v,1);\n"
            else:
                # Four f32 lanes: fold high half onto low half, then the
                # two remaining lanes.
                body += f"\tfloat32x2_t _w = v{opname}_f32(vget_high_f32(_v), vget_low_f32(_v));\n" \
                        f"\t{stype} _r = vgetq_lane_f32(_w,0);\n"
                if reduction_function:
                    body += f"\t_r = {reduction_function}(_r, vget_lane_f32(_w,1));\n"
                else:
                    body += f"\t_r {opstr}= vget_lane_f32(_w,1);\n"

        case _:
            raise ValueError(f"Unsupported instruction set {instruction_set}")

    # finalize reduction
    # Fold the lane-reduction result into the incoming scalar argument.
    if reduction_function:
        body += f"\treturn {reduction_function}(_r, {sname});\n"
    else:
        body += f"\treturn {sname} {opstr} _r;\n"

    # function decl
    decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var)

    return decl + body + "}\n"
ScalarTypes.Float +} + +vtypes_for_instruction_set = { + InstructionSets.SSE3: { + True: VectorTypes.SSE3_128d, + False: VectorTypes.SSE3_128 + }, + InstructionSets.AVX: { + True: VectorTypes.AVX_256d, + False: VectorTypes.AVX_256 + }, + InstructionSets.AVX512: { + True: VectorTypes.AVX_512d, + False: VectorTypes.AVX_512 + }, + InstructionSets.NEON: { + True: VectorTypes.NEON_64x2, + False: VectorTypes.NEON_32x4 + }, +} + +guards_for_instruction_sets = { + InstructionSets.SSE3: "__SSE3__", + InstructionSets.AVX: "__AVX__", + InstructionSets.AVX512: '__AVX512F__', + InstructionSets.NEON: '_M_ARM64', +} + +code = """#pragma once + +#include <cmath> + +""" + +for instruction_set in InstructionSets: + code += f"#if defined({guards_for_instruction_sets[instruction_set]})\n" + + if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]: + code += "#include <immintrin.h>\n\n" + elif instruction_set == InstructionSets.NEON: + code += "#include <arm_neon.h>\n\n" + else: + ValueError(f"Missing header include for instruction set {instruction_set}") + + for reduction_op in ReductionOps: + for double_prec in [True, False]: + scalar_var = Variable("dst", stypes[double_prec]) + vector_var = Variable("src", vtypes_for_instruction_set[instruction_set][double_prec]) + + code += generate_simd_horizontal_op(reduction_op, scalar_var, vector_var) + "\n" + + code += "#endif\n\n" + +print(code) -- GitLab