From 4e688b867339c90a6fba3ad12c383d1e59211ed9 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 9 May 2025 11:50:36 +0200 Subject: [PATCH] Include reductions WIP state Squashed commit of the following: commit d2f04568f355d9fd55b907ee42399747355d63b0 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue May 6 09:43:47 2025 +0200 Add missing include of gpu atomics commit 013e6546902cc2e2b1e508d03c52f394a891df37 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 12:24:10 2025 +0200 Broadcast reduction init val as neutral element commit d9c3be8594a55ea74b63c320ccf516b627cb510c Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 12:23:38 2025 +0200 Minor typecheck fix commit 13e2c1181cf0cd9eb3d5b695ca5304c9d7c7048f Merge: 0a095f83 3e664665 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:34:09 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 0a095f8322df12f51e08551cb5fca509680a027c Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:33:56 2025 +0200 Move simd horizontal op header commit c6bcdefbcfb2ebe25ea28987827c2f7990aa7057 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:31:51 2025 +0200 Move gpu atomics header commit e1c2c4636f82fac2c7a7bbc9e65ff998f8a40e3e Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 11:30:16 2025 +0200 Add reductions param to LoopVectorizer commit 83fcc73a60d210e9a79d2257a6d7359572172757 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon May 5 10:32:32 2025 +0200 Add reductions param to AddOpenMP commit c76b897f87b2a4841ab0894249cdaf771d7941b5 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 16:09:09 2025 +0200 Drop ReductionFunctions and introduce PsReductionWriteBack commit 3e66466514bc1d426056369e21d0ed53580082f5 Author: Frederik Hennig <frederik.hennig@fau.de> Date: Wed Apr 30 15:47:44 2025 +0200 Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik 
Hennig <frederik.hennig@fau.de> commit c430559032934ce9234dfad11fea2d60bd37c6ff Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 15:39:06 2025 +0200 Fix missing resolution of ConstantFunctions on GPU platforms commit 2ca58226cdac171f0373f4cd23b4efb77e9e7505 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 15:28:14 2025 +0200 Minor adaptation of required gpu headers commit 65362ddfc2bed0a28a7c071962366c206d552d97 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 13:55:29 2025 +0200 Fix typecheck note for match args of PsConstantFunction commit c0af39ae08afad52498ffd683c5e64cc7fbbf9a9 Merge: a9c8d6a7 d0f8a04e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 30 13:30:40 2025 +0200 Merge branch 'v2.0-dev' into rangersbach/reductions commit a9c8d6a7ccf65e79fc57d3e92818becb513ae4d3 Author: Frederik Hennig <frederik.hennig@fau.de> Date: Sat Apr 26 21:01:57 2025 +0200 Apply 4 suggestion(s) to 2 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> commit 0e2a548a372bc95cbface7e0a7b22c24aa04d289 Merge: a14f13fb 9c5b9bce Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 17:42:42 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit a14f13fb6bd17fb9dbe8e4db00a3ebd3fd2391a1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 17:39:22 2025 +0200 Omit extra type context creation for scalar op in PsVecHorizontal commit 9c5b9bce9065759461bb0c6ed2e88b4a665789d6 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:35:17 2025 +0200 Omit unnecessary type context creations for PsVecHorizontal commit 508701f6d3f352d0adfb655b8ea9aae3a15a5745 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:32:16 2025 +0200 Fix lint commit 3757e179c0e3b73f715819dbef1c316ba1b13af8 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:27:07 2025 +0200 Omit unnecessary replace_symbol call commit f3105780593ce575581cc6231c7b02f87fe40319 
Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 16:15:37 2025 +0200 Add unit test for ReductionAssignment commit 4106f9fdf0a4f1c0c6806ac3bca2a6ae467d31d1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:57:33 2025 +0200 Parameterize test_reduction_assignments with reduction ops commit 490ec9144b01c10c6a6e8d1a927fd42e37a56511 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:51:08 2025 +0200 Omit old stuff from reduction_op_to_expr commit 394836094f59bbb248a0c385510ff774a2d70457 Merge: 1580a2b0 c0df001f Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:45:27 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 1580a2b06b0aa060def7428cf5ac3f7938b81f92 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:45:11 2025 +0200 Adapt test_invalid_reduction_assignments to create new contexts for each subtest commit 325ca38652751eccdd33fe2cc9dc55735c6dfc0c Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:44:08 2025 +0200 Move checks and init value determination for ReductionAssignments to add_reduction_info commit 0c8654e3251dea9c4d0ca9e3fa14f54dc970564b Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:36:58 2025 +0200 Extend KernelAnalysis to check for invalid references to reduction symbols commit 705ac53161f9a2fe49a2898de36f118a79e2d6a2 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Apr 25 15:19:00 2025 +0200 Minor doc change commit c0df001f196655dd5b3ada6f1fa3909b90584abd Author: Frederik Hennig <frederik.hennig@fau.de> Date: Fri Apr 25 12:59:07 2025 +0200 Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> commit da00b78ff50a2b5e61ae0d9ad53f56d3a6bc4c4d Author: Frederik Hennig <frederik.hennig@fau.de> Date: Fri Apr 25 12:58:57 2025 +0200 Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> commit 
3010daed816e9fe4ed9b685b1f666c75736d8222 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 21:08:49 2025 +0200 Add typification test for PsVecHorizontal commit 94dcf3c5362238f1203b88f8a6a5873431749dc5 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:47:07 2025 +0200 Add unit test for freezing of ReductionAssignments commit f678a2fa2c38e2fa202a2c2a12472cdcc229b3d2 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:36:36 2025 +0200 Add unit test for checking border cases of freezing illegal usages of ReductionAssignments commit eb6e8c0f8c98d00cb56e6dfc7b7add5036e766ae Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:35:16 2025 +0200 Check if reduction symbol is (illegally) accessed before/after reduction assignment commit f0aba2e948fef8f535644e80b0f7d35e0e5f60e0 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 20:06:25 2025 +0200 Document attributes of ReductionInfo commit 081d11ad152bfa6e1f809af7d20789f00d39e5a9 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 19:10:21 2025 +0200 Fix indent for error handling in freeze commit dae36dd6edb5140d3bb616d5ade0d61b768d03fb Merge: 603e6a3f 6046db6e Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 18:53:39 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 6046db6e13ddd875e7c68edd9f99fb7a7101f47f Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Apr 24 17:54:31 2025 +0200 Move symbol handling for reductions to context and add more checks commit 603e6a3fce53a0bbd4d2b21faa12c34da3936b1b Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 16:46:27 2025 +0200 Fix docs commit b0904d93d48f0a165e4b1561e3b267db09986221 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 16:45:26 2025 +0200 Fix typecheck commit 3c3283cd4a823b7aeac883a794c2e2fd05e984e2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 15:44:30 2025 +0200 Merge reduction user guides 
into one document commit b4cabdd81a3e94f2881cb26798d6588badb1c377 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 14:57:25 2025 +0200 Add consistency check for PsVecHorizontal in typifier commit 935beb559495c5094a42cf0e807ca8ba203beb6d Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 14:41:04 2025 +0200 Move kernel AST modifications for reductions to distinct function commit 8394f0f4c115cb37c5e53f3e08f62a32bd128a3e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 14:18:29 2025 +0200 Move reduction_op_mapping.py commit 7b74cafaedc589a52f91e7a28d60528956edd19f Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 13:01:20 2025 +0200 Fix import commit 9e61ccebe4e6a37205441f94573f714e987d17cd Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 12:58:38 2025 +0200 Omit lanes for match args in PsVecHorizontal commit 36124cd24e89342ae96dd0bee8a05bdfc2542092 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:58:34 2025 +0200 Add check for typed symbols for ReductionAssignment constructor commit 427e53442e166cf34a23e7588a1f8cd72ce52438 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:35:47 2025 +0200 Remove more parts of reduction_assignment_from_str commit f7142b16ae26f30f96f07e0063c65389fbc34b3f Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:32:47 2025 +0200 Remove lanes arg from PsVecHorizontal commit 99726a97a0a834dfb53960ad1fb72be4f72849e2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 11:30:29 2025 +0200 Add docstring to PsVecHorizontal commit 21df6f4b5915134b537c20398b8bdeec00b6b28e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 10:57:43 2025 +0200 Omit admonitions from docs commit a2060520faddc99c58c8f22eedfe267d09d525b1 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 23 10:54:44 2025 +0200 Get rid of reduction_assignment_from_str commit bd99cd11f489ac7ae7a5dd602df69b4b087d0f08 Merge: 5bef84a8 9d634352 Author: zy69guqi 
<richard.angersbach@fau.de> Date: Wed Apr 2 19:47:55 2025 +0200 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 5bef84a8e28279c46cd73b67352201622d9aa89e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 2 19:47:49 2025 +0200 Write howto guide for reductions commit f469e70e18c278385ab4237b3f30afb70da4c606 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Apr 2 19:19:28 2025 +0200 Extend test_reduction_gpu with assume_warp_aligned_block_size and use_block_fitting parameters commit 9d6343522ea1d79e6b79c170d69fbf8d4108a78a Merge: 492fb30a b008a9e9 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 14:14:38 2025 +0200 Merge branch 'rangersbach/reductions' of i10git.cs.fau.de:pycodegen/pystencils into rangersbach/reductions commit 492fb30a9da935fe0273201de79e90b1eb1ae338 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 13:53:34 2025 +0200 Adapt error messages for reduced assignments in freeze.py commit 3484e7f794e4e720d7bc5931b90a4b7caf2ffc59 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 13:49:47 2025 +0200 Rename compound op mapping funcs commit 7b43ffd2e12f5d69ccc30fdf451ee47d7caea6ae Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 31 13:45:15 2025 +0200 Add minor comment to ReductionInfo dataclass commit b008a9e9954b83fd371c572c321b26597211a9c1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Mar 28 13:26:09 2025 +0100 Adapt guards for generated avx512 horizontal ops commit 806dcb6b2aaa42849d8af7f12b62b730eec7fa0e Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 27 17:02:57 2025 +0100 Move resolution of reductions to concrete gpu platform classes commit 4e5c89b9cd23610f27d61612b14a12e968724e3f Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 24 16:54:50 2025 +0100 Fix lint, typecheck commit f77909540a9d26ce9dc1decde5c9508bc1f2d14a Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Mar 24 16:22:40 2025 +0100 Use NPP library for 
numeric limits for CUDA, use std limits for HIP commit c31d407433bb075a42691866e87e59208bf99d90 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 18:56:41 2025 +0100 Try fixing required headers for cuda and hip for reductions commit 6a6e57f08bec708036387c78919f7d4e028d86d4 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 18:19:03 2025 +0100 Fix wrong rank being used for obtaining default block sizes commit 2c507d796f0c5abbff32386f4268d3bdb988c6fa Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 18:18:10 2025 +0100 Fix required headers for cuda/hip platforms commit 16a6e80d32e7ff12e151fd75ec6f2905903620df Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 17:11:13 2025 +0100 Add missing type fold for loop vectorizer again commit 5ee715d0df2651e745ff5de0524abfe24d48c968 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 17:04:47 2025 +0100 Reformat adapted files [skip ci] commit d7e6890cff8f327ce285a3360821a4341b2acc46 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 16:23:54 2025 +0100 Fix typecheck commit 9ec813bdd1c8dfd2a1f32bb300d6f9e7d172542f Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 16:07:01 2025 +0100 Employ optimized warp-level reduction based on check commit 90837d04eaad5b3ad049c11fa5af48cf0942e812 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 15:36:01 2025 +0100 Merge handling for GPU reductions into generic_gpu.py for the time being commit 0d7526c07b744f3cd1c7a692b794bd6e7f8bfc37 Merge: 974cf848 f8e5419f Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Mar 20 15:06:46 2025 +0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit 974cf848bfe474a5003e95c907e3e1289b6a5454 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Mar 12 16:40:25 2025 +0100 Fix header incl commit c7f1518efc14326f9b5732e7e8f6ac7f07acc71a Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Mar 12 16:28:42 2025 +0100 Move manual atomic op implementations to 
new header commit 3202d8b8f4c6902660330806e18a93321ac8f928 Merge: 4a031fc1 cd25546f Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Mar 12 16:23:59 2025 +0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit 4a031fc192fc3f7d30fde450e8ff1788d3ecc3dd Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Feb 27 11:49:00 2025 +0100 Fix lint commit a972d759e7e6e8e8e0c6241a70456184a1366143 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 26 19:15:45 2025 +0100 Fix getter for thread exec condition for dense/sparse iteration spaces in cuda.py commit 02be4d5e994e458e0e42d72045ec53355e7820d1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 19:35:48 2025 +0100 Fix typecheck commit 89a6f36ad50849267b3a95ef5a035d22566c5748 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 18:17:44 2025 +0100 Fix lint commit a8479afadc2a95e2ecda861ce7fd186375477347 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 18:11:51 2025 +0100 Refactor reductionfunction mechanism commit bcd83842f3d331fa039a96e3eab68c34b3beb6f3 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Feb 21 14:56:46 2025 +0100 Use full mask for CUDA reductions commit 2b6589b8fd5eea2e416432c9261884e9f50e4e1c Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 19:06:30 2025 +0100 Introduce masks for warp reductions and fix errors when shuffling warp results commit c7b564bed65615c77f1065dd4821b5a79d2244b3 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 19:04:57 2025 +0100 Fix CUDARuntimeError import commit ad292c2b42e01c38843eb1a5556ea2ced762f845 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 16:34:14 2025 +0100 Add initial version of warp-level reduction for CUDA commit ce816539159408b26b09b4bf6e1df0cbff437829 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 19 16:33:25 2025 +0100 Encapsulate fetching of kernel conditions for iteration spaces in separate function commit d10c65d43919f5ee611fc4a0ef7c79f3c3e65822 
Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 18 17:29:22 2025 +0100 Catch CUDARuntimeError for missing CUDA capable device in reduction GPU test commit 99125ca7de57e2d1880b355f88c7136d923fa103 Merge: 77a22268 c2c31e84 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 18 17:09:02 2025 +0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit 77a2226818946a666f98b23e95a76c16b77c1a82 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 18 12:57:49 2025 +0100 Avoid duplicate definition of atomicMin/Max for HIP commit 0c40ed634587f5854d11f7851c7ba9eb5911f4ff Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 18:31:41 2025 +0100 Use import or skip mechanism for cupy commit ef185b4e48f0c0ca40aaba84c928289995537f45 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 18:18:42 2025 +0100 Fix ImportError for cupy commit d14898373c0503bd9bbde0d3c0ee35888519f11f Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 17:54:40 2025 +0100 Fix typecheck commit 13569a616ef3eb6a28a698e8163f0748d4a4c0c0 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 17:31:23 2025 +0100 Fix lint commit 71881893e551cfa3c5ce2217b8fad0ef751e3613 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 17:16:09 2025 +0100 Split reduction test into separate CPU/GPU tests commit fe3cd6cd19f2c11166e2b8f670a0a8a337b898ef Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 16:02:05 2025 +0100 Add generator for SIMD horizontal operations and the emitted code. 
commit 58cdb79206931dfe15a4fdb7f656d8d047a2a6b8 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 17 15:30:42 2025 +0100 Fix bug with doubly inverted sign for subtraction reductions commit 7306f4ddfbb9066494480f14cdb211ae657f79ba Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 12 15:41:34 2025 +0100 Minor fix commit 8e0a74784ccb0d22c89364871ee630ba67239b2e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 12 15:22:45 2025 +0100 Refactor PsVecHorizontal as PsBinOp commit eb7823a5b28589a45f968ac6415ee2a02543a3ab Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Feb 12 14:13:05 2025 +0100 Minor refactor of reduction ops commit b4b105be0ff674bd1cacaea75e5f92bf8a7fda3c Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 11 17:21:54 2025 +0100 Minor fix commit a959082925c8c55d6116cc91e263fc862ff50fb7 Merge: d4b7e78f 60a348f1 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 11 17:17:45 2025 +0100 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit d4b7e78fca17ca130c55959e330a161d07ebba80 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 11 17:16:23 2025 +0100 Add initial implementation for horizontal reductions for vectorization commit 60a348f1f3f39d184da872007756c3aeed13ccee Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 10 18:30:29 2025 +0100 Support atomic sub, min, max for fp reductions using custom implementations with CAS mechanism commit a71e0d318a6f96bef38ce4b7b260d8f3fd73d91d Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 10 18:03:52 2025 +0100 Temporarily change default CUDA block size for CUDA jit commit 5caafdd05712621bc990ce7ef13c08f232f260af Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Feb 10 15:13:51 2025 +0100 Add guard for INFINITY numeric limit macro used by cuda backend commit 4edb0f9794ac8ad5c8329f2f3f89026aeb18a6b1 Merge: a2a59d40 4876f0b7 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Feb 6 15:57:14 2025 
+0100 Merge branch 'v2.0-dev' into rangersbach/reductions commit a2a59d40b66390cebe849870d1b9cf058da82850 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 4 18:04:15 2025 +0100 Wrap statement around generated atomic call [skip ci] commit d8ae900242264392479f4405678d9a1f1b177890 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Feb 4 17:49:37 2025 +0100 Use predefined macro values for numeric limits in cuda backend commit f60d9d5df3c87c54d587ad7496025e70ee2388f0 Author: Richard Angersbach <iwia025h@csnhr.nhr.fau.de> Date: Tue Feb 4 16:31:16 2025 +0100 Minor adaptations for reduction test commit 826ee8e26f2bb09b95bb473b347d06cd6a36207c Author: Richard Angersbach <iwia025h@csnhr.nhr.fau.de> Date: Tue Feb 4 16:30:13 2025 +0100 Try supporting pointer dtypes for reductions in cupy gpu jit commit 4c7fd40921a2486f8164efc84e455b59138ae6d1 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 31 14:56:08 2025 +0100 Fix lint [skip ci] commit 616f609f24551439403c13b0d282fab147099f9f Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 31 14:51:45 2025 +0100 Fix lint [skip ci] commit 0fb11858f2c65fc46c8dca469c75c28bf283dfdb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 20:06:00 2025 +0100 Add CUDA backend for numeric limits commit 10def05e256d2c316f5063e48bbbfebe2c84ea85 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 19:59:46 2025 +0100 Fix typecheck commit e15d3cf7a043fc45b78747d85e035dcd314bcd42 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 19:15:13 2025 +0100 Add first CUDA reduction impl using atomic operations commit 6e08683b7e5fe681986a677a014cbcadafa017f2 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 18:04:02 2025 +0100 Add dummy implementations for unfold_function in cuda/sycl platforms commit dd8f421d774f59d1dd36d4fdcb76a51375f4bda9 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 16:38:46 2025 +0100 Introduce functions to be unfolded by platform into code blocks for 
reduction init and write-back commit 53807242b8f2eae36b75abb3d101d53b3948157a Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 30 16:20:25 2025 +0100 Remove orig_symbol from reduction info as it is not needed commit a9da7d432d3bb5ed03da5f08f7b5dbd94c17ff7d Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 29 11:56:00 2025 +0100 Fix lint [skip ci] commit 06dc234497d8860c4c7fde704ba26c8c3da030a1 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 29 11:48:26 2025 +0100 Use std::numeric_limits as NumericLimitsFunctions backend for cpu commit 2424c15725cd621fd8bfa573f928b82255a17693 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 20:05:51 2025 +0100 Move reduction properties for local and ptr variable into single dataclass commit e94c4980ed860b127a27743fd9727192d667c906 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 19:18:00 2025 +0100 Adapt reduction assignment interface and employ enums instead of strings for the binary operation employed commit f71ce708a1aaa876a700e880d8cd1b63a0d080ee Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 18:00:53 2025 +0100 Enforce usage of typed symbols for reductions commit c73deaf6d54da2b95310dc1e606ed132b465c874 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 13:11:14 2025 +0100 Fix mypy errors and move binop mapping function commit 3daaa5e5a1f92a5482cb838cd791ed062dae1398 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 12:47:15 2025 +0100 Fix typecheck commit 96b5cbf286a29882b496faee3b9fe3be481d8bb3 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 28 12:44:21 2025 +0100 Fix lint commit f0d2fde6848f9cddcf0c3b38ca169f2e85abc093 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 24 16:20:37 2025 +0100 Encapsulate mapping of binop strings to actual operands and now also use for considering initial value of passed reduction pointer value commit 4c726aa6aa2df6312252cb848ace95e790d62331 Author: zy69guqi 
<richard.angersbach@fau.de> Date: Fri Jan 24 14:43:32 2025 +0100 Prepare reduction test for GPU support commit b352a2e2e8c2d7f4eeb0861dacff5d703ae51869 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 23 18:18:24 2025 +0100 Fix lint for jit/cpu_extension_module.py commit 5ca40eae11e6d018b165286183b8a8832348b808 Merge: 3e595df6 72fa8672 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 23 18:11:54 2025 +0100 Merge remote-tracking branch 'origin/rangersbach/reductions' into rangersbach/reductions commit 3e595df6c79cc1a7a8c2ff4ab86825e81aadbf43 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:35:50 2025 +0100 Refine test_reduction.py to check for result correctness commit ba697180cac45133f756364ef8798d8437852026 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:23:02 2025 +0100 Rewire existing code extraction of fields to support reduction pointer extraction commit f1c556e6f93d5fa042e12e8a0a9c57f3bdea47b7 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:30:43 2025 +0100 Integrate reduction pointers to parameters.py commit 777ab888d5032d7630827f91fd26c388a9a09db2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:51 2025 +0100 Use literals for C macros used for the numeric limits commit 3e0daa67359c7ddc17264b7fd21aa0a0429552e5 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:00 2025 +0100 Propagate properties of reduction pointer symbols to kernel parameters commit c6eedfcda96e84e8279ee624ba3c113f2339bfbe Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 15:16:32 2025 +0100 Split reduction var property into local and pointer-based reduction var properties commit c51ae2b438d688f871ba45b69476ef0c3b475462 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 18:42:43 2025 +0100 Set type of reduced variable to pointer and write back via PsMemAcc commit 3c276118bec2981a9bfc5a0d9654fec358094269 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 
17:34:06 2025 +0100 Fix declaration of local reduction var and write back to original variable commit 0a9abc2a24ec30a444613178a94382da5355a6ef Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 13:55:35 2025 +0100 Swap out neutral init values for reduced assignments with min/max op commit 45ab4e86617492462188a5fc46d3160450a54bf0 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Jan 20 17:46:49 2025 +0100 Try initializing kernel-local reduction variable copy commit 6a7a251f77d0274c32729bc5bfacfe0308d3fec9 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:06:57 2025 +0100 Try fix mypy no-redef error commit 2c15b9890291deea0cb929ae3a5221f3a0671a45 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:02:19 2025 +0100 Define type of init_val for reduction as Any commit dcdfff042120db9960f1859a60fdeb1890f02878 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:56:12 2025 +0100 Parameterize test_reduction.py for different reduction operations commit ffcd54e053a2ddda2e015d2836184f2fcaef59f3 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:55:42 2025 +0100 Fix min/max reductions commit b4dd0c8c55d26f87b4467f814c526fafc2ced76b Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:29:10 2025 +0100 Fix usage of numerical limits for init value of reduction commit 6f8fbdfe7a6cbf14c6f64a86315fa435ed0c9336 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 13:43:32 2025 +0100 Fix removal of function parameters for lhs symbols that are not declared in the kernel commit 55c9812023c384c7978f2536020d8587b7a12019 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:38:34 2025 +0100 Adapt comment of ReductionSymbolProperty commit 355d638aab1179db95204d490fe585f7f4fcb7c1 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:15:58 2025 +0100 Rename min/max of numeric limits enum commit 97c171f3cd5371152fd102d92e302fc13ae4ee2b Author: zy69guqi <richard.angersbach@fau.de> 
Date: Thu Jan 16 17:38:46 2025 +0100 Adaptations to reduction test commit dc5898a77e7b72a2221af8b66fa56640c2516e76 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 17:30:25 2025 +0100 Omit distinction between normal and reduced assignments in AssignmentCollection commit 8fb5af398d208cf9779f7b0c528237d0d79ef7fb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:20:14 2025 +0100 Fix header include of limits.h commit f30ca33b9ea897e66c996759a40b1aebf43a3688 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:19:23 2025 +0100 Fix inheritance of special math function enum classes commit 71aaf722f0a3aecfcc79e2633311aa8f5677b42d Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:18:37 2025 +0100 Add back reduced_assign to sympyextensions interface commit f00708edcaec67412905fbd26842735152da9812 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:15:24 2025 +0100 Reformat reduction.py commit 99a3335135baf5da92ec56713f96beae37798b60 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 15:52:05 2025 +0100 Add omp reduction clauses for reduced symbols commit c96a94619ff058ba81d798b6f4bfbe06cd273535 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 13:33:12 2025 +0100 Add C function selection for numeric limits functions commit e5861425c5e2e4868e0c696763297768814c02ec Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 12:59:16 2025 +0100 Minor adaptation on how symbols are given reduction property commit 3d592ab07194bc623884b81413278d954434e89b Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:51:22 2025 +0100 Try fixing circular module import commit d8a717874a0156d0c27c75b04c859b3a4a98268d Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:50:51 2025 +0100 Add dictionary of reduced symbols to codegen context commit 35c8160bf5f9da900255583bbfb10f935d5b3687 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:19:09 2025 +0100 Minor import 
fixes commit 548375295a08f88bef9ba8e597c47fe7143d8139 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:42:00 2025 +0100 Introduce reduction symbol property and add to lhs of reduced symbol commit b263d752d8af83569517aa56ef17facfa26adc91 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:39:39 2025 +0100 Add functions for numeric limits (to be supported by the backends) commit fae371d48773f9423855be824f495f3f20abcdc2 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:36:18 2025 +0100 Get rid of reduction using the division operator commit c7b9bb522828f8d5e32d3c20ad7d17eee689d2eb Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 12:59:36 2025 +0100 Expose new reduced assignments to pystencils interface commit f54fa321869682e71edd3f2828725104de9abe36 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 11:36:53 2025 +0100 Fix relative module imports for newly introduced sympyextensions for reductions commit 6b8bff09ef5b1c3f451aacae6aa730a9a822f5b2 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 14 18:13:18 2025 +0100 Initial work for introducing reduction capabilities to pystencils Signed-off-by: zy69guqi <richard.angersbach@fau.de> commit 72fa86729d84028e8b900e8a9cd0f9a8cdfab401 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:35:50 2025 +0100 Refine test_reduction.py to check for result correctness commit 4f6f5580bc5cb302e1a064a9d642e66822155db6 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 17:23:02 2025 +0100 Rewire existing code extraction of fields to support reduction pointer extraction commit 4e748308aca64b02cfa689fb0356a9a38d8c35af Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:30:43 2025 +0100 Integrate reduction pointers to parameters.py commit 6c8ee44f148f13b456c0c9c754e434cc9cf5b59a Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:51 2025 +0100 Use literals for C macros used for the numeric limits commit 
350a4eac9394ca7bb95231b5ad14a1526952e23c Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 16:01:00 2025 +0100 Propagate properties of reduction pointer symbols to kernel parameters commit b5dd2ef085b19e505d0331d8aa8f6dae9ee85eb0 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 22 15:16:32 2025 +0100 Split reduction var property into local and pointer-based reduction var properties commit 6bc3cf3f17ed3395dabc15a2207739de245cd038 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 18:42:43 2025 +0100 Set type of reduced variable to pointer and write back via PsMemAcc commit 9fd1c2ad9bb4f8c225627306f0369273092ef737 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 17:34:06 2025 +0100 Fix declaration of local reduction var and write back to original variable commit e0347f54f8b0b713525cfa473a7df63cb98ca8cb Merge: 90ca9ead 3fc9a049 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 17:11:10 2025 +0100 Merge branch 'rangersbach/reductions' of i10git.cs.fau.de:pycodegen/pystencils into rangersbach/reductions commit 3fc9a049683b0cbac6bfa721efd38db05c201236 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 21 13:55:35 2025 +0100 Swap out neutral init values for reduced assignments with min/max op commit 90ca9ead0199cd4f5988e6c43e9c9c5350f566b6 Author: zy69guqi <richard.angersbach@fau.de> Date: Mon Jan 20 17:46:49 2025 +0100 Try initializing kernel-local reduction variable copy commit 75ea862f50d5372126d1a934b4b1e15ba3dc8c85 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:06:57 2025 +0100 Try fix mypy no-redef error commit 3c5a93b4a0016a2c12e0f210fd1324816e6df15e Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 15:02:19 2025 +0100 Define type of init_val for reduction as Any commit 9bbb8181dcd0717dd61de853f33e11c1ca19d806 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:56:12 2025 +0100 Parameterize test_reduction.py for different reduction operations commit 
a3025645e30265621d41315e7a0449768225c361 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:55:42 2025 +0100 Fix min/max reductions commit bb984679607d2f3625849667f06c348245b1813a Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 14:29:10 2025 +0100 Fix usage of numerical limits for init value of reduction commit fff5a079171ad884069cd307f2777a2dc0aa68f6 Author: zy69guqi <richard.angersbach@fau.de> Date: Fri Jan 17 13:43:32 2025 +0100 Fix removal of function parameters for lhs symbols that are not declared in the kernel commit 1a1c23b57015ace0c821d0a709d2bf17fa67b42a Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:38:34 2025 +0100 Adapt comment of ReductionSymbolProperty commit f16d8e7978174c24f682202b65aa64e1d53003bb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 18:15:58 2025 +0100 Rename min/max of numeric limits enum commit e9ee769d2b8d2f177611d6ca5c2ccc0394d83874 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 17:38:46 2025 +0100 Adaptations to reduction test commit 9a8e6f9bb9a14a144f80b678147c1a4c36456741 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 17:30:25 2025 +0100 Omit distinction between normal and reduced assignments in AssignmentCollection commit 9741c024245137405c2c7b09db63267d88d6c12b Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:20:14 2025 +0100 Fix header include of limits.h commit cf2ec0662b8ea423c252c71e4e926e7fc388d4da Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:19:23 2025 +0100 Fix inheritance of special math function enum classes commit ef9239ede7d00457d90aab5a1740894dfc47d21a Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:18:37 2025 +0100 Add back reduced_assign to sympyextensions interface commit 555a6a836408071ca32c705bdf0fdf5e5e610437 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 16:15:24 2025 +0100 Reformat reduction.py commit a16969bf5c054b93c5e8a1a69a291d8437bbaa35 Author: 
zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 15:52:05 2025 +0100 Add omp reduction clauses for reduced symbols commit 4ae330dc8e87ac8cbb83c6f402ab99cdcb9e9edb Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 13:33:12 2025 +0100 Add C function selection for numeric limits functions commit af855492661c5649d3286eb2153f369b3813fb88 Author: zy69guqi <richard.angersbach@fau.de> Date: Thu Jan 16 12:59:16 2025 +0100 Minor adaptation on how symbols are given reduction property commit b8718cb1d67b14b39b9d806ff3131179fa97e24e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:51:22 2025 +0100 Try fixing circular module import commit 53fc7ca4c0ad2e601a09050b3273781ded65542a Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:50:51 2025 +0100 Add dictionary of reduced symbols to codegen context commit 66ce43954c585e5d576f895ab0d60f62196db813 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 17:19:09 2025 +0100 Minor import fixes commit 719a76fba40197320d03bab06e1d139e6d24a724 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:42:00 2025 +0100 Introduce reduction symbol property and add to lhs of reduced symbol commit 778cfd51b4df56c7fccf3686b0c0d3273b43a202 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:39:39 2025 +0100 Add functions for numeric limits (to be supported by the backends) commit ba1458538a9c954803d26337f7b428f599421f2c Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 16:36:18 2025 +0100 Get rid of reduction using the division operator commit 558a0f20e082370a0bccd20b96a647e3536bc31e Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 12:59:36 2025 +0100 Expose new reduced assignments to pystencils interface commit 543bf118944b32b851b526964ee275d7a1808034 Author: zy69guqi <richard.angersbach@fau.de> Date: Wed Jan 15 11:36:53 2025 +0100 Fix relative module imports for newly introduced sympyextensions for reductions commit 
8a2bf6648548c18ab47d71db429956e2bf1f52b6 Author: zy69guqi <richard.angersbach@fau.de> Date: Tue Jan 14 18:13:18 2025 +0100 Initial work for introducing reduction capabilities to pystencils Signed-off-by: zy69guqi <richard.angersbach@fau.de> --- docs/source/backend/gpu_codegen.md | 1 + docs/source/index.rst | 1 + docs/source/user_manual/reductions.md | 202 ++++++++++++ src/pystencils/__init__.py | 21 +- src/pystencils/backend/ast/analysis.py | 2 +- src/pystencils/backend/ast/vector.py | 82 ++++- src/pystencils/backend/emission/ir_printer.py | 15 +- src/pystencils/backend/functions.py | 35 +- .../backend/kernelcreation/analysis.py | 17 + .../backend/kernelcreation/context.py | 88 ++++- .../backend/kernelcreation/freeze.py | 58 +++- .../backend/kernelcreation/typification.py | 36 +- src/pystencils/backend/platforms/cuda.py | 110 ++++++- .../backend/platforms/generic_cpu.py | 47 ++- .../backend/platforms/generic_gpu.py | 139 ++++++-- src/pystencils/backend/platforms/hip.py | 16 +- src/pystencils/backend/platforms/platform.py | 7 +- src/pystencils/backend/platforms/sycl.py | 7 +- src/pystencils/backend/platforms/x86.py | 38 ++- .../backend/reduction_op_mapping.py | 24 ++ .../backend/transformations/add_pragmas.py | 16 + .../transformations/loop_vectorizer.py | 68 +++- .../transformations/select_functions.py | 44 ++- .../transformations/select_intrinsics.py | 16 +- src/pystencils/codegen/driver.py | 60 +++- src/pystencils/codegen/gpu_indexing.py | 4 +- .../pystencils_runtime/bits/gpu_atomics.h | 90 +++++ .../include/pystencils_runtime/cuda.cuh | 1 + .../include/pystencils_runtime/hip.h | 1 + .../simd_horizontal_helpers.h | 231 +++++++++++++ src/pystencils/jit/cpu_extension_module.py | 63 +++- src/pystencils/jit/gpu_cupy.py | 10 +- src/pystencils/simp/assignment_collection.py | 1 + src/pystencils/sympyextensions/__init__.py | 5 +- src/pystencils/sympyextensions/reduction.py | 82 +++++ tests/frontend/test_sympyextensions.py | 38 ++- tests/kernelcreation/test_reduction.py | 
93 ++++++ .../nbackend/kernelcreation/test_analysis.py | 38 +++ tests/nbackend/kernelcreation/test_freeze.py | 83 +++++ .../kernelcreation/test_typification.py | 47 ++- util/generate_simd_horizontal_op.py | 309 ++++++++++++++++++ 41 files changed, 2112 insertions(+), 134 deletions(-) create mode 100644 docs/source/user_manual/reductions.md create mode 100644 src/pystencils/backend/reduction_op_mapping.py create mode 100644 src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h create mode 100644 src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h create mode 100644 src/pystencils/sympyextensions/reduction.py create mode 100644 tests/kernelcreation/test_reduction.py create mode 100644 tests/nbackend/kernelcreation/test_analysis.py create mode 100644 util/generate_simd_horizontal_op.py diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 3fe00840e..a95c36566 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -1,3 +1,4 @@ +(gpu_codegen)= # GPU Code Generation The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components: diff --git a/docs/source/index.rst b/docs/source/index.rst index 6dba50af1..4e1070979 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,6 +81,7 @@ Topics user_manual/symbolic_language user_manual/kernelcreation + user_manual/reductions user_manual/gpu_kernels user_manual/WorkingWithTypes diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md new file mode 100644 index 000000000..5b45a921c --- /dev/null +++ b/docs/source/user_manual/reductions.md @@ -0,0 +1,202 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.4 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +mystnb: + execution_mode: cache +--- + +```{code-cell} 
ipython3 +:tags: [remove-cell, raises-exception] + +import sympy as sp +import pystencils as ps +import numpy as np +import cupy as cp + +from enum import Enum +``` + +(guide_reductions)= +# Reductions in Pystencils + +Reductions play a vital role in numerical simulations as they allow aggregating data across multiple elements, +such as computing sums, products over an array or finding its minima or maxima. + +## Specifying Assignments with Reductions + +In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`. +Here is a snippet creating a reduction assignment for adding up all elements of a field: + +```{code-cell} ipython3 +r = ps.TypedSymbol("r", "double") +x = ps.fields(f"x: double[3D]", layout="fzyx") + +assign_sum = ps.AddReductionAssignment(r, x.center()) +``` + +For each point in the iteration space, the left-hand side symbol `r` accumulates the contents of the +right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via additions. + +**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.** + +The following reduction assignment classes are available in pystencils: +* `AddReductionAssignment`: Builds sum over elements +* `SubReductionAssignment`: Builds difference over elements +* `MulReductionAssignment`: Builds product over elements +* `MinReductionAssignment`: Finds minimum element +* `MaxReductionAssignment`: Finds maximum element + +:::{note} +Alternatively, you can also make use of the `reduction_assignment` function +to specify reduction assignments: +::: + +```{code-cell} ipython3 +from pystencils.sympyextensions import reduction_assignment +from pystencils.sympyextensions.reduction import ReductionOp + +assign_sum = reduction_assignment(r, ReductionOp.Add, x.center()) +``` + +For other reduction operations, the following enums can be passed to `reduction_assignment`. 
+ +```{code-cell} python3 +class ReductionOp(Enum): + Add = "+" + Sub = "-" + Mul = "*" + Min = "min" + Max = "max" +``` + +## Generating and Running Reduction Kernels + +With the assignments being fully assembled, we can finally invoke the code generator and +create the kernel object via the {any}`create_kernel` function. + +### CPU Platforms + +For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled. + +```{code-cell} ipython3 +cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) +kernel = ps.create_kernel(assign_sum, cfg) + +ps.inspect(kernel) +``` + +:::{note} +The generated reduction kernels may vary vastly for different platforms and optimizations. +You can find a detailed description of configuration choices and their impact on the generated code below. +::: + +The kernel can be compiled and run immediately. + +To execute the kernel on CPUs, not only a {any}`numpy.ndarray` has to be passed for each field +but also one for exporting reduction results. +The export mechanism can be seen in the previously generated code snippet. +Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`). +This pointer is used for exporting the reduction result back from the kernel. +Please note that the **values passed via pointer will not be overwritten** +but will be incorporated in the reduction computation. +Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a singular value. + +```{code-cell} ipython3 + kernel_func = kernel.compile() + + x_array = np.ones((4, 4, 4), dtype="float64") + reduction_result = np.zeros((1,), dtype="float64") + + kernel_func(x=x_array, r=reduction_result) + + reduction_result[0] +``` + +### GPU Platforms + +Please note that **reductions are currently only supported for CUDA platforms**. 
+Similar to the CPU section, a base variant for NVIDIA GPUs without +explicitly employing any optimizations is shown: + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + kernel_gpu = ps.create_kernel(assign_sum, cfg) + + ps.inspect(kernel_gpu) +``` + +The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer +now require device memory, i.e. instances of {any}`cupy.ndarray`. + +## Optimizations for Reductions + +Going beyond the aforementioned basic kernel configurations, +we now demonstrate optimization strategies for different platforms +that can be applied to reduction kernels and show what impact they have. + +### CPU Platforms + +For CPU platforms, standard optimizations are employing SIMD vectorization and shared-memory parallelism using OpenMP. +The supported SIMD instruction sets for reductions are: +* SSE3 +* AVX/AVX2 +* AVX512 + +Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`. +**Note that reductions require `assume_inner_stride_one` to be enabled.** +This is due to the fact that other inner strides would require masked SIMD operations +which are not supported yet. + +```{code-cell} ipython3 +# configure SIMD vectorization +cfg = ps.CreateKernelConfig( + target=ps.Target.X86_AVX, +) +cfg.cpu.vectorize.enable = True +cfg.cpu.vectorize.assume_inner_stride_one = True + +# configure OpenMP parallelization +cfg.cpu.openmp.enable = True +cfg.cpu.openmp.num_threads = 8 + +kernel_cpu_opt = ps.create_kernel(assign_sum, cfg) + +ps.inspect(kernel_cpu_opt) +``` + +### GPU Platforms + +As evident from the generated kernel for the base variant, atomic operations are employed +for updating the pointer holding the reduction result. +Using the *explicit warp-level instructions* provided by CUDA allows us to achieve higher performance compared to +only using atomic operations. 
+To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by +the hardware's warp size. +**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this +criterion by enabling `assume_warp_aligned_block_size`.** +While the default block sizes provided by the code generator already fulfill this criterion, +we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space. + +You can find more detailed information about warp size alignment in {ref}`gpu_codegen`. + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + cfg.gpu.assume_warp_aligned_block_size = True + + kernel_gpu_opt = ps.create_kernel(assign_sum, cfg) + + kernel_func = kernel_gpu_opt.compile() + kernel_func.launch_config.fit_block_size((32, 1, 1)) + + ps.inspect(kernel_gpu_opt) +``` diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 07283d529..329f61d32 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -1,10 +1,6 @@ """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions""" -from .codegen import ( - Target, - CreateKernelConfig, - AUTO -) +from .codegen import Target, CreateKernelConfig, AUTO from .defaults import DEFAULTS from . import fd from . 
import stencil as stencil @@ -34,6 +30,13 @@ from .simp import AssignmentCollection from .sympyextensions.typed_sympy import TypedSymbol, DynamicType from .sympyextensions import SymbolCreator, tcast from .datahandling import create_data_handling +from .sympyextensions.reduction import ( + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, +) __all__ = [ "Field", @@ -61,6 +64,11 @@ __all__ = [ "AssignmentCollection", "Assignment", "AddAugmentedAssignment", + "AddReductionAssignment", + "SubReductionAssignment", + "MulReductionAssignment", + "MinReductionAssignment", + "MaxReductionAssignment", "assignment_from_stencil", "SymbolCreator", "create_data_handling", @@ -81,4 +89,5 @@ __all__ = [ ] from . import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py index edeba04f2..7032690a0 100644 --- a/src/pystencils/backend/ast/analysis.py +++ b/src/pystencils/backend/ast/analysis.py @@ -62,7 +62,7 @@ class UndefinedSymbolsCollector: case PsAssignment(lhs, rhs): undefined_vars = self(lhs) | self(rhs) - if isinstance(lhs, PsSymbolExpr): + if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr): undefined_vars.remove(lhs.symbol) return undefined_vars diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 705d25094..d7ae8d6a9 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -3,8 +3,9 @@ from __future__ import annotations from typing import cast from .astnode import PsAstNode -from .expressions import PsExpression, PsLvalue, PsUnOp +from .expressions import PsExpression, PsLvalue, PsUnOp, PsBinOp from .util import failing_cast +from ...sympyextensions import ReductionOp from ...types import PsVectorType @@ -17,7 +18,7 @@ class PsVecBroadcast(PsUnOp, 
PsVectorOp): """Broadcast a scalar value to N vector lanes.""" __match_args__ = ("lanes", "operand") - + def __init__(self, lanes: int, operand: PsExpression): super().__init__(operand) self._lanes = lanes @@ -25,26 +26,83 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): @property def lanes(self) -> int: return self._lanes - + @lanes.setter def lanes(self, n: int): self._lanes = n def _clone_expr(self) -> PsVecBroadcast: return PsVecBroadcast(self._lanes, self._operand.clone()) - + def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecBroadcast): return False + return super().structurally_equal(other) and self._lanes == other._lanes + + +class PsVecHorizontal(PsBinOp, PsVectorOp): + """Perform a horizontal reduction across a vector onto a scalar base value. + + **Example:** vec_horizontal_add(s, v)` will compute `s + v[0] + v[1] + ... + v[n-1]`. + + Args: + scalar_operand: Scalar operand + vector_operand: Vector operand to be converted to a scalar value + reduction_op: Binary operation that is also used for the horizontal reduction + """ + + __match_args__ = ("scalar_operand", "vector_operand", "reduction_op") + + def __init__( + self, + scalar_operand: PsExpression, + vector_operand: PsExpression, + reduction_op: ReductionOp, + ): + super().__init__(scalar_operand, vector_operand) + self._reduction_op = reduction_op + + @property + def scalar_operand(self) -> PsExpression: + return self._op1 + + @scalar_operand.setter + def scalar_operand(self, op: PsExpression): + self._op1 = op + + @property + def vector_operand(self) -> PsExpression: + return self._op2 + + @vector_operand.setter + def vector_operand(self, op: PsExpression): + self._op2 = op + + @property + def reduction_op(self) -> ReductionOp: + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op: ReductionOp): + self._reduction_op = op + + def _clone_expr(self) -> PsVecHorizontal: + return PsVecHorizontal( + self._op1.clone(), self._op2.clone(), 
self._reduction_op + ) + + def structurally_equal(self, other: PsAstNode) -> bool: + if not isinstance(other, PsVecHorizontal): + return False return ( super().structurally_equal(other) - and self._lanes == other._lanes + and self._reduction_op == other._reduction_op ) class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): """Pointer-based vectorized memory access. - + Args: base_ptr: Pointer identifying the accessed memory region offset: Offset inside the memory region @@ -95,7 +153,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): @property def stride(self) -> PsExpression | None: return self._stride - + @stride.setter def stride(self, expr: PsExpression | None): self._stride = expr @@ -106,10 +164,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): def get_vector_type(self) -> PsVectorType: return cast(PsVectorType, self._dtype) - + def get_children(self) -> tuple[PsAstNode, ...]: - return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,)) - + return (self._ptr, self._offset) + ( + () if self._stride is None else (self._stride,) + ) + def set_child(self, idx: int, c: PsAstNode): idx = [0, 1, 2][idx] match idx: @@ -138,7 +198,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): and self._vector_entries == other._vector_entries and self._aligned == other._aligned ) - + def __repr__(self) -> str: return ( f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, " diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index ffb65181c..5a3836d50 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -10,7 +10,7 @@ from .base_printer import BasePrinter, Ops, LR from ..ast import PsAstNode from ..ast.expressions import PsBufferAcc -from ..ast.vector import PsVecMemAcc, PsVecBroadcast +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal if TYPE_CHECKING: from 
...codegen import Kernel @@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel): class IRAstPrinter(BasePrinter): """Print the IR AST as pseudo-code. - + This printer produces a complete pseudocode representation of a pystencils AST. Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for each node defined in `ast <pystencils.backend.ast>`. @@ -77,6 +77,17 @@ class IRAstPrinter(BasePrinter): f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest ) + case PsVecHorizontal(scalar_operand, vector_operand, reduction_op): + pc.push_op(Ops.Weakest, LR.Middle) + scalar_operand_code = self.visit(scalar_operand, pc) + vector_operand_code = self.visit(vector_operand, pc) + pc.pop_op() + + return pc.parenthesize( + f"vec_horizontal_{reduction_op.name.lower()}({scalar_operand_code, vector_operand_code})", + Ops.Weakest, + ) + case _: return super().visit(node, pc) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index e815cbf99..a3da9a1de 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -3,9 +3,9 @@ from typing import Any, Sequence, TYPE_CHECKING from abc import ABC from enum import Enum -from ..types import PsType, PsNumericType +from ..sympyextensions import ReductionOp +from ..types import PsType, PsNumericType, PsTypeError from .exceptions import PsInternalCompilerError -from ..types import PsTypeError if TYPE_CHECKING: from .ast.expressions import PsExpression @@ -70,7 +70,7 @@ class MathFunctions(Enum): class PsMathFunction(PsFunction): - """Homogenously typed mathematical functions.""" + """Homogeneously typed mathematical functions.""" __match_args__ = ("func",) @@ -95,6 +95,33 @@ class PsMathFunction(PsFunction): return hash(self._func) +class PsReductionWriteBack(PsFunction): + """Function representing a reduction kernel's write-back step supported by the backend. + + Each platform has to materialize this function to a concrete implementation. 
+ """ + + def __init__(self, reduction_op: ReductionOp) -> None: + super().__init__("WriteBackToPtr", 2) + self._reduction_op = reduction_op + + @property + def reduction_op(self) -> ReductionOp: + return self._reduction_op + + def __str__(self) -> str: + return f"{super().name}" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, PsReductionWriteBack): + return False + + return self._reduction_op == other._reduction_op + + def __hash__(self) -> int: + return hash(self._reduction_op) + + class ConstantFunctions(Enum): """Numerical constant functions. @@ -122,7 +149,7 @@ class PsConstantFunction(PsFunction): and will be broadcast by the vectorizer. """ - __match_args__ = ("func,") + __match_args__ = ("func",) def __init__( self, func: ConstantFunctions, dtype: PsNumericType | None = None diff --git a/src/pystencils/backend/kernelcreation/analysis.py b/src/pystencils/backend/kernelcreation/analysis.py index 1365e1ef3..e5f8b921e 100644 --- a/src/pystencils/backend/kernelcreation/analysis.py +++ b/src/pystencils/backend/kernelcreation/analysis.py @@ -13,6 +13,8 @@ from ...simp import AssignmentCollection from sympy.codegen.ast import AssignmentBase from ..exceptions import PsInternalCompilerError, KernelConstraintsError +from ...sympyextensions.reduction import ReductionAssignment +from ...sympyextensions.typed_sympy import TypedSymbol class KernelAnalysis: @@ -54,6 +56,8 @@ class KernelAnalysis: self._check_access_independence = check_access_independence self._check_double_writes = check_double_writes + self._reduction_symbols: set[TypedSymbol] = set() + # Map pairs of fields and indices to offsets self._field_writes: dict[KernelAnalysis.FieldAndIndex, set[Any]] = defaultdict( set @@ -88,6 +92,14 @@ class KernelAnalysis: for asm in asms: self._visit(asm) + case ReductionAssignment(): + assert isinstance(obj.lhs, TypedSymbol) + + self._reduction_symbols.add(obj.lhs) + + self._handle_rhs(obj.rhs) + self._handle_lhs(obj.lhs) + case AssignmentBase(): 
self._handle_rhs(obj.rhs) self._handle_lhs(obj.lhs) @@ -152,6 +164,11 @@ class KernelAnalysis: f"{field} is read at {offsets} and written at {write_offset}" ) case sp.Symbol(): + if expr in self._reduction_symbols: + raise KernelConstraintsError( + f"Illegal access to reduction symbol {expr.name} outside of ReductionAssignment. " + ) + self._scopes.access_symbol(expr) for arg in expr.args: diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 68da893ff..319d6061c 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -1,12 +1,16 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Iterable, Iterator, Any from itertools import chain, count from collections import namedtuple, defaultdict import re +from ..ast.expressions import PsExpression, PsConstantExpr, PsCall +from ..functions import PsConstantFunction, ConstantFunctions from ...defaults import DEFAULTS from ...field import Field, FieldType +from ...sympyextensions import ReductionOp from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType from ..memory import PsSymbol, PsBuffer @@ -44,6 +48,26 @@ class FieldsInKernel: FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array")) +@dataclass(frozen=True) +class ReductionInfo: + """Information about a reduction operation, its neutral element in form of an initial value + and the pointer used by the kernel as write-back argument. 
+ + Attributes: + =========== + + reduction_op : Reduction operation being performed + init_val : Initial value used to initialize local symbol + local_symbol : Kernel-local symbol used to accumulate intermediate reduction result + writeback_ptr_symbol : Symbol that is used to export the final reduction result + """ + + op: ReductionOp + init_val: PsExpression + local_symbol: PsSymbol + writeback_ptr_symbol: PsSymbol + + class KernelCreationContext: """Manages the translation process from the SymPy frontend to the backend AST, and collects all necessary information for the translation: @@ -75,6 +99,8 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) + self._reduction_data: dict[str, ReductionInfo] = dict() + self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -93,7 +119,7 @@ class KernelCreationContext: def index_dtype(self) -> PsIntegerType: """Data type used by default for index expressions""" return self._index_dtype - + def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType: """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are.""" match dtype: @@ -178,6 +204,61 @@ class KernelCreationContext: self._symbols[old.name] = new + def add_reduction_info( + self, + lhs_name: str, + lhs_dtype: PsNumericType, + reduction_op: ReductionOp, + ): + """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name.""" + + # make sure that lhs symbol never occurred before ReductionAssignment + if self.find_symbol(lhs_name): + raise KernelConstraintsError( + f"Cannot create reduction with symbol {lhs_name}: " + "Another symbol with the same name already exist." 
+ ) + + # add symbol for lhs with pointer datatype for write-back mechanism + pointer_symb = self.get_symbol(lhs_name, PsPointerType(lhs_dtype)) + + # create kernel-local copy of lhs symbol + local_symb = self.get_new_symbol(f"{lhs_name}_local", lhs_dtype) + + # match for reduction operation and set neutral init_val + init_val: PsExpression + match reduction_op: + case ReductionOp.Add: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Sub: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Mul: + init_val = PsConstantExpr(PsConstant(1)) + case ReductionOp.Min: + init_val = PsCall(PsConstantFunction(ConstantFunctions.PosInfinity), []) + case ReductionOp.Max: + init_val = PsCall(PsConstantFunction(ConstantFunctions.NegInfinity), []) + case _: + raise PsInternalCompilerError( + f"Unsupported kind of reduction assignment: {reduction_op}." + ) + + # create reduction info and add to set + reduction_info = ReductionInfo( + reduction_op, init_val, local_symb, pointer_symb + ) + self._reduction_data[lhs_name] = reduction_info + + return reduction_info + + def find_reduction_info(self, name: str) -> ReductionInfo | None: + """Find a ReductionInfo with the given name in the lookup table, if it exists. + + Returns: + The ReductionInfo with the given name, or `None` if it does not exist. 
+ """ + return self._reduction_data.get(name, None) + def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None ) -> PsSymbol: @@ -213,6 +294,11 @@ class KernelCreationContext: """Return an iterable of all symbols listed in the symbol table.""" return self._symbols.values() + @property + def reduction_data(self) -> dict[str, ReductionInfo]: + """Return a dictionary holding kernel-local reduction information for given symbol names.""" + return self._reduction_data + # Fields and Arrays @property diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 0737d4c74..0fb1bd986 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,9 +1,8 @@ from typing import overload, cast, Any from functools import reduce -from operator import add, mul, sub, truediv +from operator import add, mul, sub import sympy as sp -import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment @@ -13,8 +12,10 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) +from ..reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc +from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...sympyextensions.bit_masks import bit_conditional from ...field import Field, FieldType @@ -178,19 +179,46 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - match expr.op: - case "+=": - op = add - case "-=": - op = sub - case "*=": - op = mul - case "/=": - op = truediv - case _: - raise FreezeError(f"Unsupported augmented assignment: {expr.op}.") + # transform augmented assignment to reduction op + str_to_reduction_op: dict[str, ReductionOp] = { + "+=": ReductionOp.Add, + "-=": 
ReductionOp.Sub, + "*=": ReductionOp.Mul, + "/=": ReductionOp.Div, + } + + # reuse existing handling for transforming reduction ops to expressions + return PsAssignment( + lhs, reduction_op_to_expr(str_to_reduction_op[expr.op], lhs.clone(), rhs) + ) - return PsAssignment(lhs, op(lhs.clone(), rhs)) + def map_ReductionAssignment(self, expr: ReductionAssignment): + assert isinstance(expr.lhs, TypedSymbol) + + rhs = self.visit(expr.rhs) + + assert isinstance(rhs, PsExpression) + + reduction_op = expr.reduction_op + lhs_symbol = expr.lhs + lhs_dtype = lhs_symbol.dtype + lhs_name = lhs_symbol.name + + if not isinstance(lhs_dtype, PsNumericType): + raise FreezeError("Reduction symbol must have a numeric data type.") + + # get reduction info from context + reduction_info = self._ctx.add_reduction_info( + lhs_name, lhs_dtype, reduction_op + ) + + # create new lhs from newly created local lhs symbol + new_lhs = PsSymbolExpr(reduction_info.local_symbol) + + # get new rhs from augmented assignment + new_rhs: PsExpression = reduction_op_to_expr(reduction_op, new_lhs, rhs) + + return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) @@ -563,7 +591,7 @@ class FreezeExpressions: def map_Not(self, neg: sympy.logic.Not) -> PsNot: arg = self.visit_expr(neg.args[0]) return PsNot(arg) - + def map_bit_conditional(self, conditional: bit_conditional): args = [self.visit_expr(arg) for arg in conditional.args] bitpos, mask, then_expr = args[:3] diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index e7a19bb50..c966262ea 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -49,7 +49,7 @@ from ..ast.expressions import ( PsNeg, PsNot, ) -from ..ast.vector import PsVecBroadcast, PsVecMemAcc +from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal from 
..functions import PsMathFunction, CFunction, PsConstantFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError @@ -245,7 +245,7 @@ class TypeContext: f" Expression: {expr}" f" Type Context: {self._target_type}" ) - + case PsCast(cast_target, _) if cast_target is None: expr.target_type = self._target_type # endif @@ -604,6 +604,38 @@ class Typifier: else: tc.apply_dtype(PsBoolType(), expr) + case PsVecHorizontal(): + # bin op consisting of a scalar and a vector that is converted to a scalar + # -> whole expression should be treated as scalar + + self.visit_expr(expr.scalar_operand, tc) + + vector_op_tc = TypeContext() + self.visit_expr(expr.vector_operand, vector_op_tc) + + if tc.target_type is None or vector_op_tc.target_type is None: + raise TypificationError( + f"Unable to determine type of argument to vector horizontal: {expr}" + ) + + if not isinstance(tc.target_type, PsScalarType): + raise TypificationError( + f"Illegal type in scalar operand (op1) to vector horizontal: {tc.target_type}" + ) + + if not isinstance(vector_op_tc.target_type, PsVectorType): + raise TypificationError( + f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" + ) + + if vector_op_tc.target_type.scalar_type is not tc.target_type: + raise TypificationError( + f"Scalar type of vector operand {vector_op_tc.target_type} " + f"does not correspond to type of scalar operand {tc.target_type}" + ) + + tc.apply_dtype(tc.target_type, expr) + case PsBinOp(op1, op2): self.visit_expr(op1, tc) self.visit_expr(op2, tc) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 4e5c5f275..6e6488ee1 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,119 @@ from __future__ import annotations +import math + from .generic_gpu import GenericGpu +from ..ast import PsAstNode +from ..ast.expressions import ( + PsExpression, + 
PsLiteralExpr, + PsCall, + PsAnd, + PsConstantExpr, + PsSymbolExpr, +) +from ..ast.structural import ( + PsConditional, + PsStatement, + PsAssignment, + PsBlock, + PsStructuralNode, +) +from ..constants import PsConstant +from ..functions import CFunction +from ..literals import PsLiteral +from ..reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType +from ...types.quick import SInt, UInt class CudaPlatform(GenericGpu): - """Platform for the CUDA GPU taret.""" + """Platform for the CUDA GPU target.""" @property def required_headers(self) -> set[str]: return super().required_headers | {'"pystencils_runtime/cuda.cuh"'} + + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + raise NotImplementedError( + "atomic operations are only available for float32/64 datatypes" + ) + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end + match reduction_op: + case ReductionOp.Sub: + actual_reduction_op = ReductionOp.Add + case _: + actual_reduction_op = reduction_op + + # check if thread is valid for performing reduction + ispace = self._ctx.get_iteration_space() + is_valid_thread = self._get_condition_for_translation(ispace) + + cond: PsExpression + shuffles: tuple[PsAssignment, ...] 
+ if self._warp_size and self._assume_warp_aligned_block_size: + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall( + CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [ + full_mask, + symbol_expr, + PsConstantExpr(PsConstant(offset, SInt(32))), + ], + ) + + # set up shuffle instructions for warp-level reduction + num_shuffles = math.frexp(self._warp_size)[1] + shuffles = tuple( + PsAssignment( + symbol_expr, + reduction_op_to_expr( + actual_reduction_op, + symbol_expr, + gen_shuffle_instr(pow(2, i - 1)), + ), + ) + for i in reversed(range(1, num_shuffles)) + ) + + # find first thread in warp + first_thread_in_warp = self._first_thread_in_warp(ispace) + + # set condition to only execute atomic operation on first valid thread in warp + cond = ( + PsAnd(is_valid_thread, first_thread_in_warp) + if is_valid_thread + else first_thread_in_warp + ) + else: + # no optimization: only execute atomic add on valid thread + shuffles = () + cond = is_valid_thread + + # use atomic operation + func = CFunction( + f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void") + ) + func_args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return shuffles, PsConditional( + cond, PsBlock([PsStatement(PsCall(func, func_args))]) + ) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 20167ffd6..cbde419d4 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -2,16 +2,20 @@ from abc import ABC, abstractmethod from typing import Sequence import numpy as np -from pystencils.backend.ast.expressions import PsCall +from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr +from ..ast import PsAstNode from ..functions import ( CFunction, - PsMathFunction, MathFunctions, + PsMathFunction, PsConstantFunction, 
ConstantFunctions, + PsReductionWriteBack, ) -from ...types import PsIntegerType, PsIeeeFloatType +from ..reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform from ..exceptions import MaterializationError @@ -24,7 +28,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock +from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsStructuralNode from ..ast.expressions import ( PsSymbolExpr, PsExpression, @@ -60,11 +64,40 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, (PsMathFunction | PsConstantFunction)) + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, (PsReductionWriteBack | PsMathFunction | PsConstantFunction)) + + if isinstance(call_func, PsReductionWriteBack): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptr_expr.dtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + symbol_expr.dtype, PsScalarType + ) + + ptr_access = PsMemAcc( + ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + + # create binop and potentially select corresponding function for e.g. 
min or max + potential_call = reduction_op_to_expr(actual_op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + return self.select_function(potential_call) + + return potential_call - func = call.function.func dtype = call.get_dtype() + func = call_func.func arg_types = (dtype,) * call.function.arg_count expr: PsExpression | None = None diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 60019c4cb..876da4c91 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,14 +1,23 @@ from __future__ import annotations + +import operator from abc import ABC, abstractmethod +from functools import reduce import numpy as np -from ...types import constify, deconstify, PsIntegerType +from ..ast import PsAstNode +from ...sympyextensions.reduction import ReductionOp +from ...types import ( + constify, + deconstify, + PsIntegerType, +) +from ...types.quick import SInt from ..exceptions import MaterializationError from .platform import Platform from ..memory import PsSymbol from ..kernelcreation import ( - KernelCreationContext, Typifier, IterationSpace, FullIterationSpace, @@ -17,7 +26,13 @@ from ..kernelcreation import ( ) from ..constants import PsConstant -from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..kernelcreation.context import KernelCreationContext +from ..ast.structural import ( + PsBlock, + PsConditional, + PsDeclaration, + PsStructuralNode, +) from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -25,19 +40,24 @@ from ..ast.expressions import ( PsCall, PsLookup, PsBufferAcc, + PsConstantExpr, + PsAdd, + PsRem, + PsEq, ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral + from ..functions import ( - PsMathFunction, MathFunctions, CFunction, + PsReductionWriteBack, + 
PsMathFunction, PsConstantFunction, ConstantFunctions, ) - int32 = PsSignedIntegerType(width=32, const=False) BLOCK_IDX = [ @@ -179,23 +199,38 @@ class GenericGpu(Platform): thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ + @property + @abstractmethod + def required_headers(self) -> set[str]: + return set() + + @abstractmethod + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + pass + def __init__( self, ctx: KernelCreationContext, + assume_warp_aligned_block_size: bool, + warp_size: int | None, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + self._warp_size = warp_size + self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) self._typify = Typifier(ctx) - @property - def required_headers(self) -> set[str]: - return set() - def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace ) -> PsBlock: @@ -206,11 +241,70 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, (PsMathFunction | PsConstantFunction)) + @staticmethod + def _get_condition_for_translation(ispace: IterationSpace): + + if isinstance(ispace, FullIterationSpace): + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + + return condition + elif isinstance(ispace, SparseIterationSpace): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = 
PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + @staticmethod + def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: + """Returns thread indices multiplied with block dimension strides per dimension.""" + + return tuple( + idx + * PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) + ) + + def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression: + """Returns expression that determines whether a thread is the first within a warp.""" + + tids_per_dim = GenericGpu._thread_index_per_dim(ispace) + tid: PsExpression = tids_per_dim[0] + for t in tids_per_dim[1:]: + tid = PsAdd(tid, t) + + return PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) + + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, (PsReductionWriteBack | PsMathFunction | PsConstantFunction)) + + if isinstance(call_func, PsReductionWriteBack): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + return self.resolve_reduction(ptr_expr, symbol_expr, op) - func = call.function.func dtype = call.get_dtype() + func = call_func.func arg_types = (dtype,) * call.function.arg_count expr: PsExpression | None = None @@ -273,7 +367,7 @@ class GenericGpu(Platform): case ConstantFunctions.PosInfinity: expr = PsExpression.make(PsLiteral(f"PS_FP{dtype.width}_INFINITY", dtype)) - + case ConstantFunctions.NegInfinity: expr = PsExpression.make(PsLiteral(f"PS_FP{dtype.width}_NEG_INFINITY", dtype)) @@ -285,7 +379,7 @@ class GenericGpu(Platform): if isinstance(dtype, PsIntegerType): expr = self._select_integer_function(call) - if expr is not None: + if expr is not 
None: if expr.dtype is None: typify = Typifier(self._ctx) typify(expr) @@ -304,7 +398,6 @@ class GenericGpu(Platform): ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] - conds = [] dimensions = ispace.dimensions_in_loop_order() @@ -318,14 +411,9 @@ class GenericGpu(Platform): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - conds.append(PsLt(ctr_expr, dim.stop)) - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - - return ast + cond = self._get_condition_for_translation(ispace) + return PsBlock(indexing_decls + [PsConditional(cond, body)]) def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace @@ -355,8 +443,5 @@ class GenericGpu(Platform): ] body.statements = mappings + body.statements - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - - return ast + cond = self._get_condition_for_translation(ispace) + return PsBlock([sparse_idx_decl, PsConditional(cond, body)]) diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index 6c2c79798..b712ac699 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -1,11 +1,25 @@ from __future__ import annotations from .generic_gpu import GenericGpu +from ..ast import PsAstNode +from ..ast.expressions import PsExpression +from ..ast.structural import PsStructuralNode +from ..exceptions import MaterializationError +from ...sympyextensions import ReductionOp class HipPlatform(GenericGpu): - """Platform for the HIP GPU taret.""" + """Platform for the HIP GPU target.""" @property def required_headers(self) -> set[str]: return super().required_headers | {'"pystencils_runtime/hip.h"'} + + def resolve_reduction( + self, + ptr_expr: 
PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + + raise MaterializationError("Reductions are not yet supported in HIP backend.") diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index a3e0b6328..f66b15741 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod +from ..ast import PsAstNode from ...types import PsIntegerType -from ..ast.structural import PsBlock +from ..ast.structural import PsBlock, PsStructuralNode from ..ast.expressions import PsCall, PsExpression, PsTernary, PsGe, PsLe from ..functions import PsMathFunction, MathFunctions from ..constants import PsConstant @@ -38,7 +39,9 @@ class Platform(ABC): pass @abstractmethod - def select_function(self, call: PsCall) -> PsExpression: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. 
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 4f018a21b..dd043f06b 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -1,12 +1,13 @@ from __future__ import annotations +from ..ast import PsAstNode from ..functions import CFunction, PsMathFunction, MathFunctions from ..kernelcreation.iteration_space import ( IterationSpace, FullIterationSpace, SparseIterationSpace, ) -from ..ast.structural import PsDeclaration, PsBlock, PsConditional +from ..ast.structural import PsDeclaration, PsBlock, PsConditional, PsStructuralNode from ..ast.expressions import ( PsExpression, PsSymbolExpr, @@ -52,7 +53,9 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 3f0285f0c..16dbc14c4 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Sequence +from typing import Sequence, Tuple from enum import Enum from functools import cache @@ -17,8 +17,9 @@ from ..ast.expressions import ( PsCast, PsCall, ) -from ..ast.vector import PsVecMemAcc, PsVecBroadcast -from ...types import PsCustomType, PsVectorType, PsPointerType +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...sympyextensions import ReductionOp +from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant from ..exceptions import MaterializationError @@ -132,6 +133,8 @@ class X86VectorCpu(GenericVectorCpu): else: headers = 
{"<immintrin.h>"} + headers.update({'"pystencils_runtime/simd_horizontal_helpers.h"'}) + return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: @@ -160,7 +163,14 @@ class X86VectorCpu(GenericVectorCpu): ) -> PsExpression: match expr: case PsUnOp() | PsBinOp(): - func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype()) + vtype: PsType + if isinstance(expr, PsVecHorizontal): + # return type of expression itself is scalar, but input argument to intrinsic is a vector + vtype = expr.vector_operand.get_dtype() + else: + vtype = expr.get_dtype() + + func = _x86_op_intrin(self._vector_arch, expr, vtype) intrinsic = func(*operands) intrinsic.dtype = func.return_type return intrinsic @@ -339,6 +349,7 @@ def _x86_op_intrin( prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) rtype = atype = varch.intrin_type(vtype) + atypes: Tuple[PsType, ...] = () match op: case PsVecBroadcast(): @@ -346,6 +357,16 @@ def _x86_op_intrin( if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4: suffix += "x" atype = vtype.scalar_type + case PsVecHorizontal(): + # horizontal add instead of sub avoids double inversion of sign + actual_op = ( + ReductionOp.Add + if op.reduction_op == ReductionOp.Sub + else op.reduction_op + ) + opstr = f"horizontal_{actual_op.name.lower()}" + rtype = vtype.scalar_type + atypes = (vtype.scalar_type, vtype) case PsAdd(): opstr = "add" case PsSub(): @@ -392,7 +413,9 @@ def _x86_op_intrin( case (SInt(64), Fp()) | ( Fp(), SInt(64), - ) if varch < X86VectorArch.AVX512: + ) if ( + varch < X86VectorArch.AVX512 + ): panic() # AVX512 only: cvtepiA_epiT if A > T case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512: @@ -408,4 +431,7 @@ def _x86_op_intrin( ) num_args = 1 if isinstance(op, PsUnOp) else 2 - return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) + if not atypes: + atypes = (atype,) * num_args + + return 
CFunction(f"{prefix}_{opstr}_{suffix}", atypes, rtype) diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py new file mode 100644 index 000000000..a97a496e0 --- /dev/null +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -0,0 +1,24 @@ +from .ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv +from .exceptions import PsInternalCompilerError +from .functions import PsMathFunction, MathFunctions +from ..sympyextensions.reduction import ReductionOp + + +def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: + match op: + case ReductionOp.Add: + return PsAdd(op1, op2) + case ReductionOp.Sub: + return PsSub(op1, op2) + case ReductionOp.Mul: + return PsMul(op1, op2) + case ReductionOp.Div: + return PsDiv(op1, op2) + case ReductionOp.Min: + return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2]) + case ReductionOp.Max: + return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) + case _: + raise PsInternalCompilerError( + f"Found unsupported operation type for reduction assignments: {op}." 
+ ) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index bd782422f..434903da0 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -8,7 +8,9 @@ from ..kernelcreation import KernelCreationContext from ..ast import PsAstNode from ..ast.structural import PsBlock, PsLoop, PsPragma, PsStructuralNode from ..ast.expressions import PsExpression +from ..kernelcreation.context import ReductionInfo +from ...types import PsScalarType __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"] @@ -103,12 +105,16 @@ class AddOpenMP: def __init__( self, ctx: KernelCreationContext, + reductions: Sequence[ReductionInfo] = (), nesting_depth: int = 0, num_threads: int | None = None, schedule: str | None = None, collapse: int | None = None, omit_parallel: bool = False, ) -> None: + + self._reductions = reductions + pragma_text = "omp" if not omit_parallel: @@ -122,6 +128,16 @@ class AddOpenMP: if num_threads is not None: pragma_text += f" num_threads({str(num_threads)})" + for reduction_info in self._reductions: + if isinstance(reduction_info.local_symbol.dtype, PsScalarType): + pragma_text += ( + f" reduction({reduction_info.op.value}: {reduction_info.local_symbol.name})" + ) + else: + raise NotImplementedError( + "OMP: Reductions for non-scalar data types are not supported yet. 
+ ) + if collapse is not None: if collapse <= 0: raise ValueError( diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index e1e4fea50..b029b95a7 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -1,15 +1,22 @@ import numpy as np from enum import Enum, auto -from typing import cast, Callable, overload +from typing import cast, Callable, overload, Sequence +from ..kernelcreation.context import ReductionInfo from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression, PsTernary, PsGt -from ..ast.vector import PsVecBroadcast +from ..ast.structural import ( + PsLoop, + PsBlock, + PsDeclaration, + PsAssignment, + PsStructuralNode, +) +from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr +from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols from .ast_vectorizer import VectorizationAxis, VectorizationContext, AstVectorizer @@ -48,10 +55,12 @@ class LoopVectorizer: self, ctx: KernelCreationContext, lanes: int, + reductions: Sequence[ReductionInfo] = (), trailing_iters: TrailingItersTreatment = TrailingItersTreatment.SCALAR_LOOP, ): self._ctx = ctx self._lanes = lanes + self._reductions = reductions self._trailing_iters = trailing_iters from ..kernelcreation import Typifier @@ -134,6 +143,45 @@ class LoopVectorizer: # Prepare vectorization context vc = VectorizationContext(self._ctx, self._lanes, axis) + # Prepare reductions found in loop body + simd_init_local_reduction_vars: list[PsStructuralNode] = [] + simd_writeback_local_reduction_vars: list[PsStructuralNode] = [] + for stmt in loop.body.statements: + for reduction_info in 
self._reductions: + match stmt: + case PsAssignment(lhs, _) if ( + isinstance(lhs, PsSymbolExpr) + and lhs.symbol is reduction_info.local_symbol + ): + # Vectorize symbol for local copy + local_symbol = lhs.symbol + vector_symb = vc.vectorize_symbol(local_symbol) + + # Declare and init vector + simd_init_local_reduction_vars += [ + self._type_fold( + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast( + self._lanes, + reduction_info.init_val.clone(), + ), + ) + ) + ] + + # Write back vectorization result + simd_writeback_local_reduction_vars += [ + PsAssignment( + PsSymbolExpr(local_symbol), + PsVecHorizontal( + PsSymbolExpr(local_symbol), + PsSymbolExpr(vector_symb), + reduction_info.op, + ), + ) + ] + # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -224,10 +272,10 @@ class LoopVectorizer: ) return PsBlock( - [ - simd_stop_decl, - simd_step_decl, - simd_loop, + simd_init_local_reduction_vars + + [simd_stop_decl, simd_step_decl, simd_loop] + + simd_writeback_local_reduction_vars + + [ trailing_start_decl, trailing_loop, ] @@ -238,11 +286,13 @@ class LoopVectorizer: case LoopVectorizer.TrailingItersTreatment.NONE: return PsBlock( - [ + simd_init_local_reduction_vars + + [ simd_stop_decl, simd_step_decl, simd_loop, ] + + simd_writeback_local_reduction_vars ) @overload diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index 1580f9d39..5953bd47d 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,7 +1,9 @@ +from ..ast.structural import PsAssignment, PsBlock, PsStructuralNode +from ..exceptions import MaterializationError from ..platforms import Platform from ..ast import PsAstNode -from ..ast.expressions import PsCall -from ..functions import PsMathFunction, PsConstantFunction +from ..ast.expressions import PsCall, PsExpression +from ..functions import 
PsMathFunction, PsConstantFunction, PsReductionWriteBack class SelectFunctions: @@ -17,9 +19,41 @@ class SelectFunctions: def visit(self, node: PsAstNode) -> PsAstNode: node.children = [self.visit(c) for c in node.children] - if isinstance(node, PsCall) and isinstance( - node.function, (PsMathFunction | PsConstantFunction) + if isinstance(node, PsAssignment): + rhs = node.rhs + if isinstance(rhs, PsCall) and isinstance( + rhs.function, PsReductionWriteBack + ): + resolved_func = self._platform.select_function(rhs) + + match resolved_func: + case (prepend, new_rhs): + assert isinstance(prepend, tuple) + + match new_rhs: + case PsExpression(): + return PsBlock( + prepend + (PsAssignment(node.lhs, new_rhs),) + ) + case PsStructuralNode(): + # special case: produces structural with atomic operation writing value back to ptr + return PsBlock(prepend + (new_rhs,)) + case _: + assert False, "Unexpected output from SelectFunctions." + case PsExpression(): + return PsAssignment(node.lhs, resolved_func) + case _: + raise MaterializationError( + f"Wrong return type for resolved function {rhs.function.name} in SelectFunctions." 
+ ) + else: + return node + elif isinstance(node, PsCall) and isinstance( + node.function, (PsMathFunction | PsConstantFunction) ): - return self._platform.select_function(node) + resolved_func = self._platform.select_function(node) + assert isinstance(resolved_func, PsExpression) + + return resolved_func else: return node diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 060192810..b20614393 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -7,7 +7,7 @@ from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral from ...types import PsCustomType, PsVectorType, constify, deconstify from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp -from ..ast.vector import PsVecMemAcc +from ..ast.vector import PsVecMemAcc, PsVecHorizontal from ..exceptions import MaterializationError from ..functions import CFunction, PsMathFunction @@ -86,6 +86,10 @@ class SelectIntrinsics: new_rhs = self.visit_expr(rhs, sc) return PsStatement(self._platform.vector_store(lhs, new_rhs)) + case PsAssignment(lhs, rhs) if isinstance(rhs, PsVecHorizontal): + new_rhs = self.visit_expr(rhs, sc) + return PsAssignment(lhs, new_rhs) + case _: node.children = [self.visit(c, sc) for c in node.children] @@ -93,7 +97,15 @@ class SelectIntrinsics: def visit_expr(self, expr: PsExpression, sc: SelectionContext) -> PsExpression: if not isinstance(expr.dtype, PsVectorType): - return expr + # special case: result type of horizontal reduction is scalar + if isinstance(expr, PsVecHorizontal): + scalar_op = expr.scalar_operand + vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) + return self._platform.op_intrinsic( + expr, [scalar_op, vector_op_to_scalar] + ) + else: + return expr match expr: case 
PsSymbolExpr(symb): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index e9fc69b76..d9531b33c 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -19,14 +19,22 @@ from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter from .functions import Lambda from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration +from ..backend.kernelcreation.context import ReductionInfo from ..field import Field from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop -from ..backend.ast.expressions import PsExpression +from ..backend.functions import PsReductionWriteBack +from ..backend.ast.expressions import ( + PsExpression, + PsSymbolExpr, + PsCall, + PsMemAcc, + PsConstantExpr, +) +from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment, PsStructuralNode from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -183,6 +191,10 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() + # Extensions for reductions + for _, reduction_info in self._ctx.reduction_data.items(): + self._modify_kernel_ast_for_reductions(reduction_info, kernel_ast) + # Target-Specific optimizations if self._target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) @@ -283,6 +295,31 @@ class DefaultKernelCreationDriver: return kernel_body + def _modify_kernel_ast_for_reductions(self, + reduction_info: ReductionInfo, + kernel_ast: PsBlock): + # typify local symbol and write-back pointer expressions and initial value + typify = Typifier(self._ctx) + symbol_expr = typify(PsSymbolExpr(reduction_info.local_symbol)) + ptr_symbol_expr = 
typify(PsSymbolExpr(reduction_info.writeback_ptr_symbol)) + init_val = typify(reduction_info.init_val) + + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionWriteBack(reduction_info.op), + [ptr_symbol_expr, symbol_expr], + ) + + # declare and init local copy with neutral element + prepend_ast: list[PsStructuralNode] = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast: list[PsStructuralNode] = [PsAssignment(ptr_access, write_back_ptr)] + + # modify AST + kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast + def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock: canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) @@ -321,6 +358,7 @@ class DefaultKernelCreationDriver: add_omp = AddOpenMP( self._ctx, + reductions=list(self._ctx.reduction_data.values()), nesting_depth=omp_options.get_option("nesting_depth"), num_threads=omp_options.get_option("num_threads"), schedule=omp_options.get_option("schedule"), @@ -381,7 +419,8 @@ class DefaultKernelCreationDriver: self._target, cast(PsScalarType, self._ctx.default_dtype) ) - vectorizer = LoopVectorizer(self._ctx, num_lanes) + vectorizer = LoopVectorizer(self._ctx, num_lanes, + list(self._ctx.reduction_data.values())) def loop_predicate(loop: PsLoop): return loop.counter.symbol == inner_loop_dim.counter @@ -405,14 +444,18 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: warp_size = 
GpuOptions.default_warp_size(self._target) if warp_size is None and assume_warp_aligned_block_size: - warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") + warn( + "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`." + ) return GpuIndexing( self._ctx, @@ -457,6 +500,11 @@ class DefaultKernelCreationDriver: else None ) + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) + warp_size: int | None = self._cfg.gpu.get_option("warp_size") + GpuPlatform: type match self._target: case Target.CUDA: @@ -468,6 +516,8 @@ class DefaultKernelCreationDriver: return GpuPlatform( self._ctx, + assume_warp_aligned_block_size, + warp_size, thread_mapping=thread_mapping, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 43b612bd7..f5606da02 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -260,6 +260,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, + rank: int, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool, @@ -270,7 +271,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + default_bs = GpuLaunchConfiguration.get_default_block_size(rank) self._default_block_size = default_bs self._init_block_size: dim3 = default_bs self._compute_block_size: ( @@ -598,6 +599,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( + rank, num_work_items, self._hw_props, self._assume_warp_aligned_block_size, diff --git a/src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h b/src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h new file mode 100644 index 000000000..6de5c3321 --- 
/dev/null +++ b/src/pystencils/include/pystencils_runtime/bits/gpu_atomics.h @@ -0,0 +1,90 @@ +#pragma once + +// No direct implementation for all atomic operations available +// -> add support by custom implementations using a CAS mechanism + +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +#endif + +#ifdef __CUDA_ARCH__ + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax 
(double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/include/pystencils_runtime/cuda.cuh b/src/pystencils/include/pystencils_runtime/cuda.cuh index 6a22e0b90..691519a01 100644 --- a/src/pystencils/include/pystencils_runtime/cuda.cuh +++ b/src/pystencils/include/pystencils_runtime/cuda.cuh @@ -3,3 +3,4 @@ #include <cuda_fp16.h> #include "./bits/gpu_infinities.h" +#include "./bits/gpu_atomics.h" diff --git a/src/pystencils/include/pystencils_runtime/hip.h b/src/pystencils/include/pystencils_runtime/hip.h index 10084103a..32d5e84c6 100644 --- a/src/pystencils/include/pystencils_runtime/hip.h +++ b/src/pystencils/include/pystencils_runtime/hip.h @@ -3,6 +3,7 @@ #include <hip/hip_fp16.h> #include "./bits/gpu_infinities.h" +#include "./bits/gpu_atomics.h" #ifdef __HIPCC_RTC__ typedef __hip_uint8_t uint8_t; diff --git a/src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h b/src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h new file mode 100644 index 000000000..bd1889153 --- /dev/null +++ b/src/pystencils/include/pystencils_runtime/simd_horizontal_helpers.h @@ -0,0 +1,231 @@ +#pragma once + +#include <cmath> + +#if defined(__SSE3__) +#include <immintrin.h> + +inline 
double _mm_horizontal_add_pd(double dst, __m128d src) { + __m128d _v = src; + return dst + _mm_cvtsd_f64(_mm_hadd_pd(_v, _v)); +} + +inline float _mm_horizontal_add_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_hadd_ps(_v, _v); + return dst + _mm_cvtss_f32(_mm_add_ps(_h, _mm_movehdup_ps(_h))); +} + +inline double _mm_horizontal_mul_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_mul_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return dst * _r; +} + +inline float _mm_horizontal_mul_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_mul_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm_horizontal_min_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_min_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmin(_r, dst); +} + +inline float _mm_horizontal_min_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_min_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm_horizontal_max_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_max_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmax(_r, dst); +} + +inline float _mm_horizontal_max_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_max_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX__) +#include <immintrin.h> + +inline double _mm256_horizontal_add_pd(double dst, __m256d src) { + __m256d _v = src; + __m256d _h = _mm256_hadd_pd(_v, _v); + return dst + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); +} + +inline float _mm256_horizontal_add_ps(float dst, __m256 src) { + __m256 _v = src; + __m256 _h = _mm256_hadd_ps(_v, _v); + 
__m128 _i = _mm_add_ps(_mm256_extractf128_ps(_h,1), _mm256_castps256_ps128(_h)); + return dst + _mm_cvtss_f32(_mm_hadd_ps(_i,_i)); +} + +inline double _mm256_horizontal_mul_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_mul_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_mul_pd(_w, _mm_permute_pd(_w,1))); + return dst * _r; +} + +inline float _mm256_horizontal_mul_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_mul_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_mul_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm256_horizontal_min_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_min_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_min_pd(_w, _mm_permute_pd(_w,1))); + return fmin(_r, dst); +} + +inline float _mm256_horizontal_min_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_min_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_min_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm256_horizontal_max_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_max_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_max_pd(_w, _mm_permute_pd(_w,1))); + return fmax(_r, dst); +} + +inline float _mm256_horizontal_max_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_max_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_max_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX512F__) +#include <immintrin.h> + +inline double 
_mm512_horizontal_add_pd(double dst, __m512d src) { + double _r = _mm512_reduce_add_pd(src); + return dst + _r; +} + +inline float _mm512_horizontal_add_ps(float dst, __m512 src) { + float _r = _mm512_reduce_add_ps(src); + return dst + _r; +} + +inline double _mm512_horizontal_mul_pd(double dst, __m512d src) { + double _r = _mm512_reduce_mul_pd(src); + return dst * _r; +} + +inline float _mm512_horizontal_mul_ps(float dst, __m512 src) { + float _r = _mm512_reduce_mul_ps(src); + return dst * _r; +} + +inline double _mm512_horizontal_min_pd(double dst, __m512d src) { + double _r = _mm512_reduce_min_pd(src); + return fmin(_r, dst); +} + +inline float _mm512_horizontal_min_ps(float dst, __m512 src) { + float _r = _mm512_reduce_min_ps(src); + return fmin(_r, dst); +} + +inline double _mm512_horizontal_max_pd(double dst, __m512d src) { + double _r = _mm512_reduce_max_pd(src); + return fmax(_r, dst); +} + +inline float _mm512_horizontal_max_ps(float dst, __m512 src) { + float _r = _mm512_reduce_max_ps(src); + return fmax(_r, dst); +} + +#endif + +#if defined(_M_ARM64) +#include <arm_neon.h> + +inline double vgetq_horizontal_add_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r += vgetq_lane_f64(_v,1); + return dst + _r; +} + +inline float vget_horizontal_add_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vadd_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r += vget_lane_f32(_w,1); + return dst + _r; +} + +inline double vgetq_horizontal_mul_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r *= vgetq_lane_f64(_v,1); + return dst * _r; +} + +inline float vget_horizontal_mul_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmul_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r *= vget_lane_f32(_w,1); + return dst * _r; +} + +inline double 
vgetq_horizontal_min_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r = fmin(_r, vgetq_lane_f64(_v,1)); + return fmin(_r, dst); +} + +inline float vget_horizontal_min_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmin_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r = fmin(_r, vget_lane_f32(_w,1)); + return fmin(_r, dst); +} + +inline double vgetq_horizontal_max_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r = fmax(_r, vgetq_lane_f64(_v,1)); + return fmax(_r, dst); +} + +inline float vget_horizontal_max_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmax_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r = fmax(_r, vget_lane_f32(_w,1)); + return fmax(_r, dst); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index fca043db9..4d76ea9ca 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -92,6 +92,7 @@ class PsKernelExtensioNModule: # Kernels and call wrappers from ..backend.emission import CAstPrinter + printer = CAstPrinter(func_prefix="FUNC_PREFIX") for name, kernel in self._kernels.items(): @@ -200,13 +201,15 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ """ def __init__(self) -> None: - self._buffer_types: dict[Field, PsType] = dict() - self._array_extractions: dict[Field, str] = dict() - self._array_frees: dict[Field, str] = dict() + self._buffer_types: dict[Any, PsType] = dict() + self._array_extractions: dict[Any, str] = dict() + self._array_frees: dict[Any, str] = dict() self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() + self._pointer_extractions: dict[Parameter, str] = dict() + self._constraint_checks: list[str] = [] 
self._call: str | None = None @@ -232,38 +235,42 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ else: return None + def get_buffer(self, buffer_name: str) -> str: + """Get the Python buffer object for a given buffer name.""" + return f"buffer_{buffer_name}" + def get_field_buffer(self, field: Field) -> str: """Get the Python buffer object for the given field.""" - return f"buffer_{field.name}" + return self.get_buffer(field.name) - def extract_field(self, field: Field) -> None: - """Add the necessary code to extract the NumPy array for a given field""" - if field not in self._array_extractions: - extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name) - actual_dtype = self._buffer_types[field] + def extract_buffer(self, buffer: Any, name: str) -> None: + """Add the necessary code to extract the NumPy array for a given buffer""" + if buffer not in self._array_extractions: + extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=name) + actual_dtype = self._buffer_types[buffer] # Check array type type_char = self._type_char(actual_dtype) if type_char is not None: - dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'" + dtype_cond = f"buffer_{name}.format[0] == '{type_char}'" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( cond=dtype_cond, what="data type", - name=field.name, + name=name, expected=str(actual_dtype), ) # Check item size itemsize = actual_dtype.itemsize - item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}" + item_size_cond = f"buffer_{name}.itemsize == {itemsize}" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( - cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize + cond=item_size_cond, what="itemsize", name=name, expected=itemsize ) - self._array_extractions[field] = extraction_code + self._array_extractions[buffer] = extraction_code - release_code = f"PyBuffer_Release(&buffer_{field.name});" - self._array_frees[field] = release_code + release_code = f"PyBuffer_Release(&buffer_{name});" + 
self._array_frees[buffer] = release_code def extract_scalar(self, param: Parameter) -> str: if param not in self._scalar_extractions: @@ -277,6 +284,26 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name + def extract_ptr(self, param: Parameter) -> str: + if param not in self._pointer_extractions: + ptr = param.symbol + ptr_dtype = ptr.dtype + + assert isinstance(ptr_dtype, PsPointerType) + + self._buffer_types[ptr] = ptr_dtype.base_type + self.extract_buffer(ptr, param.name) + buffer = self.get_buffer(param.name) + code = ( + f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + ) + + assert code is not None + + self._array_assoc_var_extractions[param] = code + + return param.name + def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] @@ -321,11 +348,13 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ actual_field_type = field.dtype self._buffer_types[prop.field] = actual_field_type - self.extract_field(prop.field) + self.extract_buffer(prop.field, field.name) for param in params: if param.is_field_parameter: self.extract_array_assoc_var(param) + elif isinstance(param.dtype, PsPointerType): + self.extract_ptr(param) else: self.extract_scalar(param) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 7d2fe0991..893589a01 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -11,7 +11,6 @@ except ImportError: from ..codegen import Target from ..field import FieldType -from ..types import PsType from .jit import JitBase, JitError, KernelWrapper from ..codegen import ( Kernel, @@ -20,7 +19,7 @@ from ..codegen import ( ) from ..codegen.gpu_indexing import GpuLaunchConfiguration from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr -from ..types import PsStructType, PsPointerType +from ..types import PsType, PsStructType, PsPointerType from ..include import get_pystencils_include_path 
@@ -186,10 +185,13 @@ class CupyKernelWrapper(KernelWrapper): arr.strides[coord] // arr.dtype.itemsize, ) break + elif isinstance(kparam.dtype, PsPointerType): + val = kwargs[kparam.name] + kernel_args.append(val) else: # scalar parameter - val: Any = kwargs[param.name] - adder(param, val) + val = kwargs[kparam.name] + adder(kparam, val) # Process Arguments diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index f1ba87154..03b4edccf 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -1,5 +1,6 @@ import itertools from copy import copy + from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union import sympy as sp diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 3933fd9b3..8c7d10a80 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,6 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc, tcast, DynamicType from .pointers import mem_acc +from .reduction import reduction_assignment, ReductionOp from .bit_masks import bit_conditional from .math import ( @@ -28,12 +29,14 @@ from .math import ( count_operations_in_ast, common_denominator, get_symmetric_part, - SymbolCreator + SymbolCreator, ) __all__ = [ "ConditionalFieldAccess", + "reduction_assignment", + "ReductionOp", "TypedSymbol", "CastFunc", "tcast", diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py new file mode 100644 index 000000000..a1a9a026c --- /dev/null +++ b/src/pystencils/sympyextensions/reduction.py @@ -0,0 +1,82 @@ +from enum import Enum + +from sympy.codegen.ast import AssignmentBase + +from . 
import TypedSymbol + + +class ReductionOp(Enum): + Add = "+" + Sub = "-" + Mul = "*" + Div = "/" + Min = "min" + Max = "max" + + +class ReductionAssignment(AssignmentBase): + """ + Base class for reduced assignments. + + Attributes: + =========== + + reduction_op : ReductionOp + Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. + """ + + _reduction_op = None # type: ReductionOp + + @property + def reduction_op(self): + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op): + self._reduction_op = op + + @classmethod + def _check_args(cls, lhs, rhs): + super()._check_args(lhs, rhs) + + if not isinstance(lhs, TypedSymbol): + raise TypeError(f"lhs of needs to be a TypedSymbol. Got {type(lhs)} instead.") + + +class AddReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Add + + +class SubReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Sub + + +class MulReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Mul + + +class MinReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Min + + +class MaxReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Max + + +# Mapping from ReductionOp enum to ReductionAssigment classes +_reduction_assignment_classes = { + cls.reduction_op: cls + for cls in [ + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + ] +} + + +def reduction_assignment(lhs, op: ReductionOp, rhs): + if op not in _reduction_assignment_classes: + raise ValueError("Unrecognized operator %s" % op) + return _reduction_assignment_classes[op](lhs, rhs) diff --git a/tests/frontend/test_sympyextensions.py b/tests/frontend/test_sympyextensions.py index ad5d2513b..152527441 100644 --- a/tests/frontend/test_sympyextensions.py +++ b/tests/frontend/test_sympyextensions.py @@ -3,7 +3,9 @@ import numpy as np import sympy as sp import 
pystencils -from pystencils import Assignment +import pytest + +from pystencils import Assignment, TypedSymbol from pystencils.sympyextensions import replace_second_order_products from pystencils.sympyextensions import remove_higher_order_terms from pystencils.sympyextensions import complete_the_squares_in_exp @@ -27,6 +29,16 @@ from pystencils.sympyextensions.integer_functions import ( div_ceil, ) +from pystencils.sympyextensions.reduction import ( + ReductionOp, + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + reduction_assignment, +) + def test_utility(): a = [1, 2] @@ -199,6 +211,30 @@ def test_count_operations(): assert ops["muls"] == 99 +@pytest.mark.parametrize("reduction_assignment_for_op", [ + (ReductionOp.Add, AddReductionAssignment), + (ReductionOp.Sub, SubReductionAssignment), + (ReductionOp.Mul, MulReductionAssignment), + (ReductionOp.Min, MinReductionAssignment), + (ReductionOp.Max, MaxReductionAssignment), +]) +def test_reduction_assignments( + reduction_assignment_for_op +): + reduction_op, reduction_assignment_type = reduction_assignment_for_op + + w = TypedSymbol("w", "float64") + v = sympy.symbols("v") + + assignment = reduction_assignment(w, reduction_op, 0) + + assert isinstance(assignment, reduction_assignment_type) + + # invalid assignment since v is not a typed symbol + with pytest.raises(TypeError): + _ = reduction_assignment(v, reduction_op, 0) + + def test_common_denominator(): x = sympy.symbols("x") expr = sympy.Rational(1, 2) + x * sympy.Rational(2, 3) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py new file mode 100644 index 000000000..1fb8efc81 --- /dev/null +++ b/tests/kernelcreation/test_reduction.py @@ -0,0 +1,93 @@ +import pytest +import numpy as np + +import pystencils as ps +from pystencils.sympyextensions import ReductionOp, reduction_assignment + +INIT_W = 5 +INIT_ARR = 2 +SIZE = 15 +SOLUTION = { + 
ReductionOp.Add: INIT_W + INIT_ARR * SIZE, + ReductionOp.Sub: INIT_W - INIT_ARR * SIZE, + ReductionOp.Mul: INIT_W * INIT_ARR**SIZE, + ReductionOp.Min: min(INIT_W, INIT_ARR), + ReductionOp.Max: max(INIT_W, INIT_ARR), +} + + +# get AST for kernel with reduction assignment +def get_reduction_assign_ast(dtype, op, config): + x = ps.fields(f"x: {dtype}[1d]") + w = ps.TypedSymbol("w", dtype) + + red_assign = reduction_assignment(w, op, x.center()) + + return ps.create_kernel([red_assign], config, default_dtype=dtype) + + +@pytest.mark.parametrize("instruction_set", ["sse", "avx"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max]) +def test_reduction_cpu(instruction_set, dtype, op): + vectorize_info = { + "instruction_set": instruction_set, + "assume_inner_stride_one": True, + } + + config = ps.CreateKernelConfig( + target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info + ) + + ast_reduction = get_reduction_assign_ast(dtype, op, config) + ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() + + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) + + +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_reduction_gpu( + dtype: str, + op: str, + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, +): + try: + import cupy as cp + from cupy_backends.cuda.api.runtime import CUDARuntimeError + + device_count = range(cp.cuda.runtime.getDeviceCount()) + print(f"Found {device_count} GPUs") 
+ except ImportError: + pytest.skip(reason="CuPy is not available", allow_module_level=True) + except CUDARuntimeError: + pytest.skip( + reason="No CUDA capable device is detected", allow_module_level=True + ) + + cfg = ps.CreateKernelConfig(target=ps.Target.GPU) + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size + + ast_reduction = get_reduction_assign_ast(dtype, op, cfg) + ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() + + if use_block_fitting: + kernel_reduction.launch_config.fit_block_size((32, 1, 1)) + + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + array_gpu = cp.asarray(array) + reduction_array_gpu = cp.asarray(reduction_array) + + kernel_reduction(x=array_gpu, w=reduction_array_gpu) + assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) diff --git a/tests/nbackend/kernelcreation/test_analysis.py b/tests/nbackend/kernelcreation/test_analysis.py new file mode 100644 index 000000000..d68c0a5f3 --- /dev/null +++ b/tests/nbackend/kernelcreation/test_analysis.py @@ -0,0 +1,38 @@ +import pytest + +from pystencils import fields, TypedSymbol, AddReductionAssignment, Assignment, KernelConstraintsError +from pystencils.backend.kernelcreation import KernelCreationContext, KernelAnalysis +from pystencils.sympyextensions import mem_acc +from pystencils.types.quick import Ptr, Fp + + +def test_invalid_reduction_symbol_reassign(): + dtype = Fp(64) + ctx = KernelCreationContext(default_dtype=dtype) + analysis = KernelAnalysis(ctx) + + x = fields(f"x: [1d]") + w = TypedSymbol("w", dtype) + + # illegal reassign to already locally defined symbol (here: reduction symbol) + with pytest.raises(KernelConstraintsError): + analysis([ + AddReductionAssignment(w, 3 * x.center()), + Assignment(w, 0) + ]) + +def test_invalid_reduction_symbol_reference(): + dtype = Fp(64) + ctx = KernelCreationContext(default_dtype=dtype) + analysis = KernelAnalysis(ctx) + + x = 
fields(f"x: [1d]") + v = TypedSymbol("v", dtype) + w = TypedSymbol("w", dtype) + + # do not allow reduction symbol to be referenced on rhs of other assignments + with pytest.raises(KernelConstraintsError): + analysis([ + AddReductionAssignment(w, 3 * x.center()), + Assignment(v, w) + ]) \ No newline at end of file diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py index 5384c90f9..679a7cebc 100644 --- a/tests/nbackend/kernelcreation/test_freeze.py +++ b/tests/nbackend/kernelcreation/test_freeze.py @@ -8,6 +8,7 @@ from pystencils import ( create_numeric_type, TypedSymbol, DynamicType, + KernelConstraintsError, ) from pystencils.sympyextensions import tcast, bit_conditional from pystencils.sympyextensions.pointers import mem_acc @@ -44,6 +45,7 @@ from pystencils.backend.ast.expressions import ( PsArrayInitList, PsSubscript, PsMemAcc, + PsSymbolExpr, ) from pystencils.backend.constants import PsConstant from pystencils.backend.functions import PsMathFunction, MathFunctions, PsConstantFunction, ConstantFunctions @@ -66,6 +68,14 @@ from pystencils.sympyextensions.integer_functions import ( ceil_to_multiple, div_ceil, ) +from pystencils.sympyextensions.reduction import ( + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, +) +from pystencils.types import PsTypeError def test_freeze_simple(): @@ -519,6 +529,79 @@ def test_invalid_arrays(): _ = freeze(symb_arr) +@pytest.mark.parametrize("reduction_assignment_rhs_type", + [ + (AddReductionAssignment, PsAdd), + (SubReductionAssignment, PsSub), + (MulReductionAssignment, PsMul), + (MinReductionAssignment, PsCall), + (MaxReductionAssignment, PsCall), + ]) +def test_reduction_assignments( + reduction_assignment_rhs_type +): + x = fields(f"x: float64[1d]") + w = TypedSymbol("w", "float64") + + reduction_op, rhs_type = reduction_assignment_rhs_type + + ctx = KernelCreationContext() + freeze = 
FreezeExpressions(ctx) + + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + expr = freeze(reduction_op(w, 3 * x.center())) + + info = ctx.find_reduction_info(w.name) + + assert isinstance(expr, PsAssignment) + assert isinstance(expr.lhs, PsSymbolExpr) + + assert expr.lhs.symbol == info.local_symbol + assert expr.lhs.dtype == w.dtype + + assert isinstance(expr.rhs, rhs_type) + + +def test_invalid_reduction_assignments(): + x = fields(f"x: float64[1d]") + w = TypedSymbol("w", "float64") + + assignment = Assignment(w, -1 * x.center()) + reduction_assignment = AddReductionAssignment(w, 3 * x.center()) + + expected_errors_for_invalid_cases = [ + # 1) Reduction symbol is used before ReductionAssignment. + # May only be used for reductions -> KernelConstraintsError + ([assignment, reduction_assignment], KernelConstraintsError), + # 2) Reduction symbol is used after ReductionAssignment. 
+ # Reduction symbol is converted to pointer after freeze -> PsTypeError + ([reduction_assignment, assignment], PsTypeError), + # 3) Duplicate ReductionAssignment + # May only be used once for now -> KernelConstraintsError + ([reduction_assignment, reduction_assignment], KernelConstraintsError) + ] + + for invalid_assignment, error_class in expected_errors_for_invalid_cases: + ctx = KernelCreationContext() + freeze = FreezeExpressions(ctx) + + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + with pytest.raises(error_class): + _ = [freeze(asm) for asm in invalid_assignment] + + def test_memory_access(): ctx = KernelCreationContext() freeze = FreezeExpressions(ctx) diff --git a/tests/nbackend/kernelcreation/test_typification.py b/tests/nbackend/kernelcreation/test_typification.py index d5b204e9f..62b0106b9 100644 --- a/tests/nbackend/kernelcreation/test_typification.py +++ b/tests/nbackend/kernelcreation/test_typification.py @@ -5,6 +5,7 @@ import numpy as np from typing import cast from pystencils import Assignment, TypedSymbol, Field, FieldType, AddAugmentedAssignment +from pystencils.sympyextensions import ReductionOp from pystencils.sympyextensions.pointers import mem_acc from pystencils.backend.ast.structural import ( @@ -34,9 +35,9 @@ from pystencils.backend.ast.expressions import ( PsTernary, PsMemAcc ) +from pystencils.backend.ast.vector import PsVecBroadcast, PsVecHorizontal from pystencils.backend.ast import dfs_preorder from pystencils.backend.ast.expressions import PsAdd -from pystencils.backend.ast.vector import PsVecBroadcast from pystencils.backend.constants import PsConstant from pystencils.backend.functions import CFunction, PsConstantFunction, ConstantFunctions from pystencils.types import constify, create_type, create_numeric_type, PsVectorType, PsTypeError 
@@ -744,6 +745,50 @@ def test_propagate_constant_type_in_broadcast(): assert expr.dtype == PsVectorType(fp16, 4) +def test_typify_horizontal_vector_reductions(): + ctx = KernelCreationContext() + typify = Typifier(ctx) + + reduction_op = ReductionOp.Add + stype = Fp(32) + vtype = PsVectorType(stype, 4) + + def create_symb_expr(name, tpe): + return PsExpression.make(ctx.get_symbol(name, tpe)) + + # create valid horizontal and check if expression type is scalar + result = typify( + PsVecHorizontal( + create_symb_expr("s1", stype), create_symb_expr("v1", vtype), ReductionOp.Add + ) + ) + assert result.get_dtype() == stype + + # create invalid horizontal by using scalar type for expected vector type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s2", stype), create_symb_expr("v2", stype), reduction_op + ) + ) + + # create invalid horizontal by using vector type for expected scalar type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s3", vtype), create_symb_expr("v3", vtype), reduction_op + ) + ) + + # create invalid horizontal where base type of vector does not match with scalar type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s4", Int(32)), create_symb_expr("v4", vtype), reduction_op + ) + ) + + def test_inference_fails(): ctx = KernelCreationContext() typify = Typifier(ctx) diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py new file mode 100644 index 000000000..1d652c6e1 --- /dev/null +++ b/util/generate_simd_horizontal_op.py @@ -0,0 +1,309 @@ +from enum import Enum + +FCT_QUALIFIERS = "inline" + + +class InstructionSets(Enum): + SSE3 = "SSE3" + AVX = "AVX" + AVX512 = "AVX512" + NEON = "NEON" + + def __str__(self): + return self.value + + +class ReductionOps(Enum): + Add = ("add", "+") + Mul = ("mul", "*") + Min = ("min", "min") + Max = ("max", "max") + + def __init__(self, op_name, 
op_str): + self.op_name = op_name + self.op_str = op_str + + +class ScalarTypes(Enum): + Double = "double" + Float = "float" + + def __str__(self): + return self.value + + +class VectorTypes(Enum): + SSE3_128d = "__m128d" + SSE3_128 = "__m128" + + AVX_256d = "__m256d" + AVX_256 = "__m256" + AVX_128 = "__m128" + + AVX_512d = "__m512d" + AVX_512 = "__m512" + + NEON_64x2 = "float64x2_t" + NEON_32x4 = "float32x4_t" + + def __str__(self): + return self.value + + +class Variable: + def __init__(self, name: str, dtype: ScalarTypes | VectorTypes): + self._name = name + self._dtype = dtype + + def __str__(self): + return f"{self._dtype} {self._name}" + + @property + def name(self) -> str: + return self._name + + @property + def dtype(self) -> ScalarTypes | VectorTypes: + return self._dtype + + +def get_intrin_from_vector_type(vtype: VectorTypes) -> InstructionSets: + match vtype: + case VectorTypes.SSE3_128 | VectorTypes.SSE3_128d: + return InstructionSets.SSE3 + case VectorTypes.AVX_256 | VectorTypes.AVX_256d: + return InstructionSets.AVX + case VectorTypes.AVX_512 | VectorTypes.AVX_512d: + return InstructionSets.AVX512 + case VectorTypes.NEON_32x4 | VectorTypes.NEON_64x2: + return InstructionSets.NEON + + +def intrin_prefix(instruction_set: InstructionSets, double_prec: bool): + match instruction_set: + case InstructionSets.SSE3: + return "_mm" + case InstructionSets.AVX: + return "_mm256" + case InstructionSets.AVX512: + return "_mm512" + case InstructionSets.NEON: + return "vgetq" if double_prec else "vget" + case _: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def intrin_suffix(instruction_set: InstructionSets, double_prec: bool): + if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]: + return "pd" if double_prec else "ps" + elif instruction_set in [InstructionSets.NEON]: + return "f64" if double_prec else "f32" + else: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def 
generate_hadd_intrin(instruction_set: InstructionSets, double_prec: bool, v: str): + return f"{intrin_prefix(instruction_set, double_prec)}_hadd_{intrin_suffix(instruction_set, double_prec)}({v}, {v})" + + +def generate_shuffle_intrin(instruction_set: InstructionSets, double_prec: bool, v: str, offset): + return f"_mm_shuffle_{intrin_suffix(instruction_set, double_prec)}({v}, {v}, {offset})" + + +def generate_op_intrin(instruction_set: InstructionSets, double_prec: bool, reduction_op: ReductionOps, a: str, b: str): + return f"_mm_{reduction_op.op_name}_{intrin_suffix(instruction_set, double_prec)}({a}, {b})" + + +def generate_cvts_intrin(double_prec: bool, v: str): + convert_suffix = "f64" if double_prec else "f32" + intrin_suffix = "d" if double_prec else "s" + return f"_mm_cvts{intrin_suffix}_{convert_suffix}({v})" + + +def generate_fct_name(instruction_set: InstructionSets, double_prec: bool, op: ReductionOps): + prefix = intrin_prefix(instruction_set, double_prec) + suffix = intrin_suffix(instruction_set, double_prec) + return f"{prefix}_horizontal_{op.op_name}_{suffix}" + + +def generate_fct_decl(instruction_set: InstructionSets, op: ReductionOps, svar: Variable, vvar: Variable): + double_prec = svar.dtype is ScalarTypes.Double + return f"{FCT_QUALIFIERS} {svar.dtype} {generate_fct_name(instruction_set, double_prec, op)}({svar}, {vvar}) {{ \n" + + +# SSE & AVX provide horizontal add 'hadd' intrinsic that allows for specialized handling +def generate_simd_horizontal_add(scalar_var: Variable, vector_var: Variable): + reduction_op = ReductionOps.Add + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + sname = scalar_var.name + vtype = vector_var.dtype + vname = vector_var.name + + simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b) + hadd = lambda var: generate_hadd_intrin(instruction_set, double_prec, var) + cvts = lambda var: 
def generate_simd_horizontal_op(reduction_op: ReductionOps, scalar_var: Variable, vector_var: Variable):
    """Generate the C source of one inline horizontal-reduction function.

    The emitted function folds all lanes of `vector_var` with `reduction_op`
    and combines the result with `scalar_var`, returning a scalar. The
    instruction set is derived from the vector type; Add on SSE3/AVX is
    delegated to the specialized `generate_simd_horizontal_add`.
    """
    instruction_set = get_intrin_from_vector_type(vector_var.dtype)
    double_prec = scalar_var.dtype is ScalarTypes.Double

    # generate specialized version for add operation
    if reduction_op == ReductionOps.Add and instruction_set in [InstructionSets.SSE3, InstructionSets.AVX]:
        return generate_simd_horizontal_add(scalar_var, vector_var)

    sname = scalar_var.name
    stype = scalar_var.dtype
    vtype = vector_var.dtype
    vname = vector_var.name

    opname = reduction_op.op_name
    opstr = reduction_op.op_str

    # min/max fold via the C library fmin/fmax functions instead of an
    # infix operator; None means "use `opstr` as an infix/compound op".
    reduction_function = f"f{opname}" \
        if reduction_op in [ReductionOps.Max, ReductionOps.Min] else None

    # Shorthands binding the current instruction set / precision.
    simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b)
    cvts = lambda var: generate_cvts_intrin(double_prec, var)
    shuffle = lambda var, offset: generate_shuffle_intrin(instruction_set, double_prec, var, offset)

    # function body
    # AVX512 reduces straight from the source register, so no local copy.
    body = f"\t{vtype} _v = {vname};\n" if instruction_set != InstructionSets.AVX512 else ""
    match instruction_set:
        case InstructionSets.SSE3:
            # Pairwise shuffle/op butterfly; masks 1, 177 (0xB1) and
            # 10 (0x0A) are _mm_shuffle_* lane selectors — see the Intel
            # intrinsics guide for the exact permutations.
            if double_prec:
                body += f"\t{stype} _r = {cvts(simd_op('_v', shuffle('_v', 1)))};\n"
            else:
                body += f"\t{vtype} _h = {simd_op('_v', shuffle('_v', 177))};\n" \
                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"

        case InstructionSets.AVX:
            # Combine upper and lower 128-bit halves first, then finish
            # the reduction within one 128-bit register.
            if double_prec:
                body += f"\t__m128d _w = {simd_op('_mm256_extractf128_pd(_v,1)', '_mm256_castpd256_pd128(_v)')};\n" \
                        f"\t{stype} _r = {cvts(simd_op('_w', '_mm_permute_pd(_w,1)'))}; \n"
            else:
                body += f"\t__m128 _w = {simd_op('_mm256_extractf128_ps(_v,1)', '_mm256_castps256_ps128(_v)')};\n" \
                        f"\t__m128 _h = {simd_op('_w', shuffle('_w', 177))};\n" \
                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"

        case InstructionSets.AVX512:
            # AVX512 provides single-intrinsic whole-register reductions.
            suffix = intrin_suffix(instruction_set, double_prec)
            body += f"\t{stype} _r = _mm512_reduce_{opname}_{suffix}({vname});\n"

        case InstructionSets.NEON:
            if double_prec:
                # Two f64 lanes: extract both and fold them pairwise.
                body += f"\t{stype} _r = vgetq_lane_f64(_v,0);\n"
                if reduction_function:
                    body += f"\t_r = {reduction_function}(_r, vgetq_lane_f64(_v,1));\n"
                else:
                    body += f"\t_r {opstr}= vgetq_lane_f64(_v,1);\n"
            else:
                # Four f32 lanes: fold high half onto low half, then the
                # two remaining lanes.
                body += f"\tfloat32x2_t _w = v{opname}_f32(vget_high_f32(_v), vget_low_f32(_v));\n" \
                        f"\t{stype} _r = vgetq_lane_f32(_w,0);\n"
                if reduction_function:
                    body += f"\t_r = {reduction_function}(_r, vget_lane_f32(_w,1));\n"
                else:
                    body += f"\t_r {opstr}= vget_lane_f32(_w,1);\n"

        case _:
            raise ValueError(f"Unsupported instruction set {instruction_set}")

    # finalize reduction
    # Fold the lane-reduction result into the incoming scalar argument.
    if reduction_function:
        body += f"\treturn {reduction_function}(_r, {sname});\n"
    else:
        body += f"\treturn {sname} {opstr} _r;\n"

    # function decl
    decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var)

    return decl + body + "}\n"
ScalarTypes.Float +} + +vtypes_for_instruction_set = { + InstructionSets.SSE3: { + True: VectorTypes.SSE3_128d, + False: VectorTypes.SSE3_128 + }, + InstructionSets.AVX: { + True: VectorTypes.AVX_256d, + False: VectorTypes.AVX_256 + }, + InstructionSets.AVX512: { + True: VectorTypes.AVX_512d, + False: VectorTypes.AVX_512 + }, + InstructionSets.NEON: { + True: VectorTypes.NEON_64x2, + False: VectorTypes.NEON_32x4 + }, +} + +guards_for_instruction_sets = { + InstructionSets.SSE3: "__SSE3__", + InstructionSets.AVX: "__AVX__", + InstructionSets.AVX512: '__AVX512F__', + InstructionSets.NEON: '_M_ARM64', +} + +code = """#pragma once + +#include <cmath> + +""" + +for instruction_set in InstructionSets: + code += f"#if defined({guards_for_instruction_sets[instruction_set]})\n" + + if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]: + code += "#include <immintrin.h>\n\n" + elif instruction_set == InstructionSets.NEON: + code += "#include <arm_neon.h>\n\n" + else: + ValueError(f"Missing header include for instruction set {instruction_set}") + + for reduction_op in ReductionOps: + for double_prec in [True, False]: + scalar_var = Variable("dst", stypes[double_prec]) + vector_var = Variable("src", vtypes_for_instruction_set[instruction_set][double_prec]) + + code += generate_simd_horizontal_op(reduction_op, scalar_var, vector_var) + "\n" + + code += "#endif\n\n" + +print(code) -- GitLab