From a4acc9ffa4766c287e34e27de310c5588bf66d66 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 29 Jan 2025 10:23:01 +0100
Subject: [PATCH 1/2] Add sqrt; fix domains in function testsuite; update
 freeze of sp.Pow

 - Add the `sqrt` function to the IR, as well as to CPU, CUDA, SYCL, and x86-vector targets
 - Rewrite case for `sp.Power` in freeze to generate `sqrt` calls where appropriate
 - Adapt function test suite: Test on values within the correct domain for all functions
 - Untrack some testsuite output files
 - Remove some deprecated imports from the testsuite
---
 src/pystencils/backend/functions.py           |   1 +
 .../backend/kernelcreation/freeze.py          |  73 ++++++++------
 src/pystencils/backend/platforms/cuda.py      |   1 +
 .../backend/platforms/generic_cpu.py          |   3 +-
 src/pystencils/backend/platforms/sycl.py      |   1 +
 src/pystencils/backend/platforms/x86.py       |   3 +
 tests/kernelcreation/test_functions.py        |  92 ++++++++++++++++--
 tests/kernelcreation/test_staggered_kernel.py |   2 +-
 tests/nbackend/kernelcreation/test_freeze.py  |  84 ++++++++++++++--
 tests/runtime/test_boundary.py                |   2 +-
 tests/runtime/test_data/.gitignore            |   1 +
 .../datahandling_parallel_save_test/dst.dat   | Bin 304 -> 0 bytes
 .../datahandling_parallel_save_test/src.dat   | Bin 304 -> 0 bytes
 .../test_data/datahandling_save_test.npz      | Bin 428 -> 0 bytes
 tests/runtime/test_datahandling.py            |   2 +-
 15 files changed, 212 insertions(+), 53 deletions(-)
 create mode 100644 tests/runtime/test_data/.gitignore
 delete mode 100644 tests/runtime/test_data/datahandling_parallel_save_test/dst.dat
 delete mode 100644 tests/runtime/test_data/datahandling_parallel_save_test/src.dat
 delete mode 100644 tests/runtime/test_data/datahandling_save_test.npz

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 388160f30..f3d18f349 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -78,6 +78,7 @@ class MathFunctions(Enum):
     ASin = ("asin", 1)
     ACos = ("acos", 1)
     ATan = ("atan", 1)
+    Sqrt = ("sqrt", 1)
 
     Abs = ("abs", 1)
     Floor = ("floor", 1)
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 44ee17077..c38fcbc97 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -212,40 +212,49 @@ class FreezeExpressions:
         base = expr.args[0]
         exponent = expr.args[1]
 
-        base_frozen = self.visit_expr(base)
-        reciprocal = False
-        expand_product = False
-
-        if exponent.is_Integer:
-            if exponent == 0:
-                return PsExpression.make(PsConstant(1))
-
-            if exponent.is_negative:
-                reciprocal = True
-                exponent = -exponent
-
-            if exponent <= sp.Integer(
-                5
-            ):  # TODO: is this a sensible limit? maybe make this configurable.
-                expand_product = True
-
-        if expand_product:
-            frozen_expr = reduce(
-                mul,
-                [base_frozen]
-                + [base_frozen.clone() for _ in range(0, int(exponent) - 1)],
-            )
-        else:
-            exponent_frozen = self.visit_expr(exponent)
-            frozen_expr = PsMathFunction(MathFunctions.Pow)(
-                base_frozen, exponent_frozen
-            )
+        expr_frozen = self.visit_expr(base)
+        
+        if isinstance(exponent, sp.Rational):
+            #   Decompose rational exponent
+            num: int = exponent.numerator
+            denom: int = exponent.denominator
 
-        if reciprocal:
-            one = PsExpression.make(PsConstant(1))
-            frozen_expr = one / frozen_expr
+            if denom <= 2 and abs(num) <= 8:
+                #   At most a square root, and at most eight factors
 
-        return frozen_expr
+                reciprocal = False
+
+                if num < 0:
+                    reciprocal = True
+                    num = -num
+
+                if denom == 2:
+                    expr_frozen = PsMathFunction(MathFunctions.Sqrt)(expr_frozen)
+                    denom = 1
+
+                assert denom == 1
+            
+                #   Pairwise multiplication for logarithmic runtime
+                factors = [expr_frozen] + [expr_frozen.clone() for _ in range(num - 1)]
+                while len(factors) > 1:
+                    combined = [x * y for x, y in zip(factors[::2], factors[1::2])]
+                    if len(factors) % 2 == 1:
+                        combined.append(factors[-1])
+                    factors = combined
+
+                expr_frozen = factors.pop()
+
+                if reciprocal:
+                    one = PsExpression.make(PsConstant(1))
+                    expr_frozen = one / expr_frozen
+
+                return expr_frozen
+        
+        #   If we got this far, use pow
+        exponent_frozen = self.visit_expr(exponent)
+        expr_frozen = PsMathFunction(MathFunctions.Pow)(expr_frozen, exponent_frozen)
+
+        return expr_frozen
 
     def map_Integer(self, expr: sp.Integer) -> PsConstantExpr:
         value = int(expr)
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index f146cfbfd..d9cd9352e 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -89,6 +89,7 @@ class CudaPlatform(GenericGpu):
                     | MathFunctions.Log
                     | MathFunctions.Sin
                     | MathFunctions.Cos
+                    | MathFunctions.Sqrt
                     | MathFunctions.Ceil
                     | MathFunctions.Floor
                 ) if dtype.width in (16, 32, 64):
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index b6d7dd551..fa8e54002 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -43,7 +43,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>"}
+        return {"<cmath>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -78,6 +78,7 @@ class GenericCpu(Platform):
                     | MathFunctions.ATan
                     | MathFunctions.ATan2
                     | MathFunctions.Pow
+                    | MathFunctions.Sqrt
                     | MathFunctions.Floor
                     | MathFunctions.Ceil
                 ):
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 9c04d6074..92ff68e21 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -80,6 +80,7 @@ class SyclPlatform(GenericGpu):
                     | MathFunctions.ATan
                     | MathFunctions.ATan2
                     | MathFunctions.Pow
+                    | MathFunctions.Sqrt
                     | MathFunctions.Floor
                     | MathFunctions.Ceil
                 ):
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index 7d2fe650f..fff143300 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -202,6 +202,9 @@ class X86VectorCpu(GenericVectorCpu):
                 opstr = expr.function.func.function_name
                 if vtype.width > 256:
                     raise MaterializationError("512bit ceil/floor require SVML.")
+                
+            case MathFunctions.Sqrt if vtype.is_float():
+                opstr = expr.function.name
 
             case MathFunctions.Min | MathFunctions.Max:
                 opstr = expr.function.func.function_name
diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py
index 9b7dd2852..6f82f5aa0 100644
--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
@@ -28,6 +28,7 @@ def unary_function(name, xp):
         "asin": (sp.asin, xp.arcsin),
         "acos": (sp.acos, xp.arccos),
         "atan": (sp.atan, xp.arctan),
+        "sqrt": (sp.sqrt, xp.sqrt),
         "abs": (sp.Abs, xp.abs),
         "floor": (sp.floor, xp.floor),
         "ceil": (sp.ceiling, xp.ceil),
@@ -46,6 +47,82 @@ def binary_function(name, xp):
 AVAIL_TARGETS = Target.available_targets()
 
 
+@pytest.fixture
+def function_domain(function_name, dtype):
+    eps = 1e-6
+    rng = np.random.default_rng()
+
+    match function_name:
+        case (
+            "exp" | "sin" | "cos" | "sinh" | "cosh" | "atan" | "abs" | "floor" | "ceil"
+        ):
+            return np.concat(
+                [
+                    [0.0, -1.0, 1.0],
+                    rng.uniform(-0.1, 0.1, 5),
+                    rng.uniform(-5.0, 5.0, 8),
+                    rng.uniform(-10, 10, 8),
+                ]
+            ).astype(dtype)
+        case "tan":
+            return np.concat(
+                [
+                    [0.0, -1.0, 1.0],
+                    rng.uniform(-np.pi / 2.0 + eps, np.pi / 2.0 - eps, 13),
+                ]
+            ).astype(dtype)
+        case "asin" | "acos":
+            return np.concat(
+                [
+                    [0.0, 1.0, -1.0],
+                    rng.uniform(-1.0, 1.0, 13),
+                ]
+            ).astype(dtype)
+        case "log" | "sqrt":
+            return np.concat(
+                [
+                    [1.0],
+                    rng.uniform(eps, 0.1, 7),
+                    rng.uniform(eps, 1.0, 8),
+                    rng.uniform(eps, 1e6, 8),
+                ]
+            ).astype(dtype)
+        case "min" | "max" | "atan2":
+            return np.concat(
+                [
+                    rng.uniform(-0.1, 0.1, 8),
+                    rng.uniform(-5.0, 5.0, 8),
+                    rng.uniform(-10, 10, 8),
+                ]
+            ).astype(dtype), np.concat(
+                [
+                    rng.uniform(-0.1, 0.1, 8),
+                    rng.uniform(-5.0, 5.0, 8),
+                    rng.uniform(-10, 10, 8),
+                ]
+            ).astype(
+                dtype
+            )
+        case "pow":
+            return np.concat(
+                [
+                    [0., 1., 1.],
+                    rng.uniform(-1., 1., 8),
+                    rng.uniform(0., 5., 8),
+                ]
+            ).astype(dtype), np.concat(
+                [
+                    [1., 0., 2.],
+                    np.arange(2., 10., 1.),
+                    rng.uniform(-2.0, 2.0, 8),
+                ]
+            ).astype(
+                dtype
+            )
+        case _:
+            assert False, "I don't know the domain of that function"
+
+
 @pytest.mark.parametrize(
     "function_name, target",
     list(
@@ -70,15 +147,15 @@ AVAIL_TARGETS = Target.available_targets()
             ["floor", "ceil"], [t for t in AVAIL_TARGETS if Target._AVX512 not in t]
         )
     )
-    + list(product(["abs"], AVAIL_TARGETS)),
+    + list(product(["sqrt", "abs"], AVAIL_TARGETS)),
 )
 @pytest.mark.parametrize("dtype", (np.float32, np.float64))
-def test_unary_functions(gen_config, xp, function_name, dtype):
+def test_unary_functions(gen_config, xp, function_name, dtype, function_domain):
     sp_func, xp_func = unary_function(function_name, xp)
     resolution = np.finfo(dtype).resolution
 
     #   Array size should be larger than eight, such that vectorized kernels don't just run their remainder loop
-    inp = xp.array([0.1, 0.2, 0.0, -0.8, -1.6, -12.592, xp.pi, xp.e, -0.3], dtype=dtype)
+    inp = xp.array(function_domain)
     outp = xp.zeros_like(inp)
 
     reference = xp_func(inp)
@@ -104,15 +181,12 @@ def test_unary_functions(gen_config, xp, function_name, dtype):
     ),
 )
 @pytest.mark.parametrize("dtype", (np.float32, np.float64))
-def test_binary_functions(gen_config, xp, function_name, dtype):
+def test_binary_functions(gen_config, xp, function_name, dtype, function_domain):
     sp_func, xp_func = binary_function(function_name, xp)
     resolution: dtype = np.finfo(dtype).resolution
 
-    inp = xp.array([0.1, 0.2, 0.3, -0.8, -1.6, -12.592, xp.pi, xp.e, 0.0], dtype=dtype)
-    inp2 = xp.array(
-        [3.1, -0.5, 21.409, 11.0, 1.0, -14e3, 2.0 * xp.pi, -xp.e, 0.0],
-        dtype=dtype,
-    )
+    inp = xp.array(function_domain[0])
+    inp2 = xp.array(function_domain[1])
     outp = xp.zeros_like(inp)
 
     reference = xp_func(inp, inp2)
diff --git a/tests/kernelcreation/test_staggered_kernel.py b/tests/kernelcreation/test_staggered_kernel.py
index 9bc9e71af..99b61d07f 100644
--- a/tests/kernelcreation/test_staggered_kernel.py
+++ b/tests/kernelcreation/test_staggered_kernel.py
@@ -5,7 +5,7 @@ import pytest
 
 import pystencils as ps
 from pystencils import x_staggered_vector, TypedSymbol
-from pystencils.enums import Target
+from pystencils import Target
 
 
 class TestStaggeredDiffusion:
diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py
index ce4f61785..fccec7114 100644
--- a/tests/nbackend/kernelcreation/test_freeze.py
+++ b/tests/nbackend/kernelcreation/test_freeze.py
@@ -113,14 +113,8 @@ def test_freeze_fields():
 
     zero = PsExpression.make(PsConstant(0))
 
-    lhs = PsBufferAcc(
-        f_arr.base_pointer,
-        (PsExpression.make(counter) + zero, zero)
-    )
-    rhs = PsBufferAcc(
-        g_arr.base_pointer,
-        (PsExpression.make(counter) + zero, zero)
-    )
+    lhs = PsBufferAcc(f_arr.base_pointer, (PsExpression.make(counter) + zero, zero))
+    rhs = PsBufferAcc(g_arr.base_pointer, (PsExpression.make(counter) + zero, zero))
 
     should = PsAssignment(lhs, rhs)
 
@@ -357,6 +351,80 @@ def test_add_sub():
     assert expr.structurally_equal(PsAdd(x2, PsMul(minus_two, y2)))
 
 
+def test_powers():
+    ctx = KernelCreationContext()
+    freeze = FreezeExpressions(ctx)
+
+    x, y, z = sp.symbols("x, y, z")
+
+    x2 = PsExpression.make(ctx.get_symbol("x"))
+    y2 = PsExpression.make(ctx.get_symbol("y"))
+
+    #   Integer powers
+    expr = freeze(x**2)
+    assert expr.structurally_equal(x2 * x2)
+
+    expr = freeze(x**3)
+    assert expr.structurally_equal(x2 * x2 * x2)
+
+    expr = freeze(x**4)
+    assert expr.structurally_equal((x2 * x2) * (x2 * x2))
+
+    expr = freeze(x**5)
+    assert expr.structurally_equal((x2 * x2) * (x2 * x2) * x2)
+
+    #   Negative integer powers
+    one = PsExpression.make(PsConstant(1))
+
+    expr = freeze(x**-2)
+    assert expr.structurally_equal(one / (x2 * x2))
+
+    expr = freeze(x**-3)
+    assert expr.structurally_equal(one / (x2 * x2 * x2))
+
+    expr = freeze(x**-4)
+    assert expr.structurally_equal(one / ((x2 * x2) * (x2 * x2)))
+
+    expr = freeze(x**-5)
+    assert expr.structurally_equal(one / ((x2 * x2) * (x2 * x2) * x2))
+
+    #   Integer powers of the square root
+    sqrt = PsMathFunction(MathFunctions.Sqrt)
+
+    expr = freeze(x ** sp.Rational(1, 2))
+    assert expr.structurally_equal(sqrt(x2))
+
+    expr = freeze(x ** sp.Rational(2, 2))
+    assert expr.structurally_equal(x2)
+
+    expr = freeze(x ** sp.Rational(3, 2))
+    assert expr.structurally_equal(sqrt(x2) * sqrt(x2) * sqrt(x2))
+
+    expr = freeze(x ** sp.Rational(4, 2))
+    assert expr.structurally_equal(x2 * x2)
+
+    expr = freeze(x ** sp.Rational(5, 2))
+    assert expr.structurally_equal(
+        (sqrt(x2) * sqrt(x2)) * (sqrt(x2) * sqrt(x2)) * sqrt(x2)
+    )
+
+    #   Negative integer powers of sqrt
+    expr = freeze(x ** sp.Rational(-1, 2))
+    assert expr.structurally_equal(one / sqrt(x2))
+
+    expr = freeze(x ** sp.Rational(-3, 2))
+    assert expr.structurally_equal(one / (sqrt(x2) * sqrt(x2) * sqrt(x2)))
+
+    #   Cube root
+    pow = PsMathFunction(MathFunctions.Pow)
+    expr = freeze(x ** sp.Rational(1, 3))
+    assert expr.structurally_equal(pow(x2, freeze(sp.Rational(1, 3))))
+
+    #   Unknown exponent
+    expr = freeze(x**y)
+    assert expr.structurally_equal(pow(x2, y2))
+
+
 def test_tuple_array_literals():
     ctx = KernelCreationContext()
     freeze = FreezeExpressions(ctx)
diff --git a/tests/runtime/test_boundary.py b/tests/runtime/test_boundary.py
index a94d37820..fb8f827e8 100644
--- a/tests/runtime/test_boundary.py
+++ b/tests/runtime/test_boundary.py
@@ -8,7 +8,7 @@ import pystencils
 from pystencils import Assignment, create_kernel
 from pystencils.boundaries import BoundaryHandling, Dirichlet, Neumann, add_neumann_boundary
 from pystencils.datahandling import SerialDataHandling
-from pystencils.enums import Target
+from pystencils import Target
 from pystencils.slicing import slice_from_direction
 from pystencils.timeloop import TimeLoop
 
diff --git a/tests/runtime/test_data/.gitignore b/tests/runtime/test_data/.gitignore
new file mode 100644
index 000000000..278bd1136
--- /dev/null
+++ b/tests/runtime/test_data/.gitignore
@@ -0,0 +1 @@
+datahandling_save_test*
\ No newline at end of file
diff --git a/tests/runtime/test_data/datahandling_parallel_save_test/dst.dat b/tests/runtime/test_data/datahandling_parallel_save_test/dst.dat
deleted file mode 100644
index 204552486f77c485a4dd333a10eff82f9d44aa9f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 304
YcmWe&fB*$X2n``W*pJen5Woxv0M%S7&Hw-a

diff --git a/tests/runtime/test_data/datahandling_parallel_save_test/src.dat b/tests/runtime/test_data/datahandling_parallel_save_test/src.dat
deleted file mode 100644
index 204552486f77c485a4dd333a10eff82f9d44aa9f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 304
YcmWe&fB*$X2n``W*pJen5Woxv0M%S7&Hw-a

diff --git a/tests/runtime/test_data/datahandling_save_test.npz b/tests/runtime/test_data/datahandling_save_test.npz
deleted file mode 100644
index 22202358a4fa1d1cea4db89c0889f5bca636598b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 428
zcmWIWW@Zs#U|`^2U|>*W=q~1yie+G6U|?lnU}q3vC@xCY%PXj4WDo!gfn>na3=9mj
z--y4G7C3n#;8?)gd6S~%#4O2Mx*%_I+QM~<7tEU$9}+ZWhWPyWDU-N__%*eBuLcI_
zpYzc12;ClZ!Qf(`!NtH0SFQxE2vra+Fl;!;{yVVBM8@s%!3?{E1;us@4ESA@Qd}}X
yu3}^oVa63&pumCP21XDK4Z8quVl_=;U|@iHEx?<V4P*ur0~13Q0|SE&*hBzalx#cz

diff --git a/tests/runtime/test_datahandling.py b/tests/runtime/test_datahandling.py
index 29e639c88..62ba64056 100644
--- a/tests/runtime/test_datahandling.py
+++ b/tests/runtime/test_datahandling.py
@@ -7,7 +7,7 @@ import numpy as np
 import pystencils as ps
 from pystencils import create_data_handling, create_kernel
 from pystencils.gpu.gpu_array_handler import GPUArrayHandler
-from pystencils.enums import Target
+from pystencils import Target
 
 try:
     import pytest
-- 
GitLab


From ecbbdf72fe3338c223124453d01bd2fb92864fa6 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 29 Jan 2025 10:35:44 +0100
Subject: [PATCH 2/2] use np.concatenate

---
 tests/kernelcreation/test_functions.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py
index 6f82f5aa0..a4d154d4b 100644
--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
@@ -56,7 +56,7 @@ def function_domain(function_name, dtype):
         case (
             "exp" | "sin" | "cos" | "sinh" | "cosh" | "atan" | "abs" | "floor" | "ceil"
         ):
-            return np.concat(
+            return np.concatenate(
                 [
                     [0.0, -1.0, 1.0],
                     rng.uniform(-0.1, 0.1, 5),
@@ -65,21 +65,21 @@ def function_domain(function_name, dtype):
                 ]
             ).astype(dtype)
         case "tan":
-            return np.concat(
+            return np.concatenate(
                 [
                     [0.0, -1.0, 1.0],
                     rng.uniform(-np.pi / 2.0 + eps, np.pi / 2.0 - eps, 13),
                 ]
             ).astype(dtype)
         case "asin" | "acos":
-            return np.concat(
+            return np.concatenate(
                 [
                     [0.0, 1.0, -1.0],
                     rng.uniform(-1.0, 1.0, 13),
                 ]
             ).astype(dtype)
         case "log" | "sqrt":
-            return np.concat(
+            return np.concatenate(
                 [
                     [1.0],
                     rng.uniform(eps, 0.1, 7),
@@ -88,13 +88,13 @@ def function_domain(function_name, dtype):
                 ]
             ).astype(dtype)
         case "min" | "max" | "atan2":
-            return np.concat(
+            return np.concatenate(
                 [
                     rng.uniform(-0.1, 0.1, 8),
                     rng.uniform(-5.0, 5.0, 8),
                     rng.uniform(-10, 10, 8),
                 ]
-            ).astype(dtype), np.concat(
+            ).astype(dtype), np.concatenate(
                 [
                     rng.uniform(-0.1, 0.1, 8),
                     rng.uniform(-5.0, 5.0, 8),
@@ -104,13 +104,13 @@ def function_domain(function_name, dtype):
                 dtype
             )
         case "pow":
-            return np.concat(
+            return np.concatenate(
                 [
                     [0., 1., 1.],
                     rng.uniform(-1., 1., 8),
                     rng.uniform(0., 5., 8),
                 ]
-            ).astype(dtype), np.concat(
+            ).astype(dtype), np.concatenate(
                 [
                     [1., 0., 2.],
                     np.arange(2., 10., 1.),
-- 
GitLab