From c50b5b934cd4ca064ef7ccd4109316ba9b4854eb Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 19 Feb 2026 13:57:49 +0100
Subject: [PATCH 1/5] Updated radix division code; more helper functions; code
 for approximate square roots of small integers

---
 doc/source/radix.rst              |  30 +++++-
 src/radix.h                       |  14 +++
 src/radix/div.c                   | 146 ++++++++++++++++++++++++++----
 src/radix/integer.c               |  51 +++++++++--
 src/radix/mul_1.c                 | 114 +++++++++++++++++++++++
 src/radix/mulmid_classical.c      |   1 -
 src/radix/profile/p-tdiv_q.c      | 107 ++++++++++++++++++++++
 src/radix/sqrt.c                  | 129 ++++++++++++++++++++++++++
 src/radix/test/main.c             |   4 +
 src/radix/test/t-divrem.c         |  29 +++++-
 src/radix/test/t-divrem_1.c       |  13 +++
 src/radix/test/t-mul_1.c          |  72 +++++++++++++++
 src/radix/test/t-rsqrt_1_approx.c |  88 ++++++++++++++++++
 13 files changed, 769 insertions(+), 29 deletions(-)
 create mode 100644 src/radix/mul_1.c
 create mode 100644 src/radix/profile/p-tdiv_q.c
 create mode 100644 src/radix/sqrt.c
 create mode 100644 src/radix/test/t-mul_1.c
 create mode 100644 src/radix/test/t-rsqrt_1_approx.c

diff --git a/doc/source/radix.rst b/doc/source/radix.rst
index 0f32c03c07..161130937b 100644
--- a/doc/source/radix.rst
+++ b/doc/source/radix.rst
@@ -107,6 +107,15 @@ Except where otherwise noted, the following rules apply:
     Sets *(res, xn)* to the sum or difference of *(x, xn)* and *(y, yn)*, returning
     carry or borrow. Requires `xn \ge yn \ge 1`.
 
+.. function:: ulong radix_mul_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix)
+
+    Sets *(res, xn)* to *(x, xn)* multiplied by *c*, returning
+    carry out. Requires `1 \le d \le c`.
+
+.. function:: ulong radix_mul_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+
+    Sets *(res, xn)* to *(x, xn)* multiplied by 2, returning carry out.
+
 .. function:: void radix_mul(nn_ptr res, nn_srcptr x, slong xn, nn_srcptr y, slong yn, const radix_t radix)
 
     Sets *(res, xn + yn)* to the product of *(x, xn)* and *(y, yn)*.
@@ -163,6 +172,11 @@ Except where otherwise noted, the following rules apply:
     Sets *(res, xn)* to the quotient of *(x, xn)* divided by *d*, returning the
     remainder. Requires `1 \le d \le B - 1`.
 
+.. function:: ulong radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+
+    Sets *(res, xn)* to the quotient of *(x, xn)* divided by 2, returning the
+    remainder.
+
 .. function:: void radix_divexact_1(nn_ptr res, nn_srcptr x, slong xn, ulong d, const radix_t radix)
 
     Sets *(res, xn)* to the quotient of *(x, xn)* divided by *d* assuming that
@@ -176,6 +190,8 @@ Except where otherwise noted, the following rules apply:
     `a \in [1/B, 1)` with `an` fraction limbs,
     sets `(q, n+2)` to an approximation of `1/a \in (1, B]` with `n` fraction limbs
     and two integral limbs (the highest limb may be zero).
+    The relative error is bounded by `4 B^{-n}`, i.e. the absolute error is bounded
+    by `4 B^{-n} / a`.
 
 .. function:: void radix_divrem_via_mpn(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
               void radix_divrem_newton(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
@@ -183,14 +199,17 @@ Except where otherwise noted, the following rules apply:
 
     Sets `(q,an-bn+1)` to the quotient and `(r,bn)` to the remainder of
     `(a,an)` divided by `(b,bn)`. Requires `an \ge bn \ge 1` and
-    `b_{bn-1} \ne 0`.
+    `b_{bn-1} \ne 0`. The user can pass ``NULL`` for `r` to compute
+    only the quotient.
 
 .. function:: void radix_divrem_preinv(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, nn_srcptr binv, slong binvn, const radix_t radix)
 
     Similar to :func:`radix_divrem`, but accepts a precomputed inverse
     of `b` given as `(b, binvn+2)` with `binvn` fraction limbs and two
     integral limbs, as computed by :func:`radix_inv_approx`.
-    Currently requires that `binvn \ge an-bn+1`.
+    Currently requires that `binvn \ge an-bn+1`. Passing an inverse with
+    several extra limbs can improve performance. The user can pass ``NULL``
+    for `r` to compute only the quotient.
 
 .. function:: int radix_div(nn_ptr q, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
 
@@ -214,6 +233,13 @@ Except where otherwise noted, the following rules apply:
     Returns -1, 0 or 1 according to whether *(x, n)* is less, equal
     or greater than `\lfloor B^n / 2 \rfloor`. We require *n* to be positive.
 
+.. function:: void radix_rsqrt_1_approx_basecase(nn_ptr res, ulong a, slong n, const radix_t radix)
+              void radix_rsqrt_1_approx(nn_ptr res, ulong a, slong n, const radix_t radix)
+
+    Sets `(res,n)` to the fractional limbs of an approximation of `1 / \sqrt{a}`.
+    Assumes that `2 \le a < B`. The error is bounded by `2 B^{-n}`, i.e. by
+    2 fixed-point ulps.
+
 Radix conversion
 --------------------------------------------------------------------------------
 
diff --git a/src/radix.h b/src/radix.h
index 2af5de3971..12a24d11c9 100644
--- a/src/radix.h
+++ b/src/radix.h
@@ -79,6 +79,8 @@ radix_sub_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix)
 
 /* Multiplication */
 
+ulong radix_mul_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix);
+
 void radix_mulmid_fft_small(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix);
 void radix_mulmid_classical(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix);
 void radix_mulmid_KS(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix);
@@ -111,10 +113,17 @@ radix_sqr(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
     radix_mul(res, a, an, a, an, radix);
 }
 
+RADIX_INLINE ulong
+radix_mul_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+{
+    return radix_add(res, a, an, a, an, radix);
+}
+
 /* Division */
 
 ulong radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix);
 void radix_divexact_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix);
+ulong radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix);
 
 void radix_inv_approx_basecase(nn_ptr q, nn_srcptr a, slong an, slong n, const radix_t radix);
 void radix_inv_approx(nn_ptr q, nn_srcptr a, slong an, slong n, const radix_t radix);
@@ -155,6 +164,11 @@ radix_cmp_bn_half(nn_srcptr x, slong n, const radix_t radix)
     return 0;
 }
 
+/* Square roots */
+
+void radix_rsqrt_1_approx_basecase(nn_ptr res, ulong a, slong n, const radix_t radix);
+void radix_rsqrt_1_approx(nn_ptr res, ulong a, slong n, const radix_t radix);
+
 /* Radix conversion */
 
 typedef struct
diff --git a/src/radix/div.c b/src/radix/div.c
index 1b59768619..7ff5a33bb1 100644
--- a/src/radix/div.c
+++ b/src/radix/div.c
@@ -12,6 +12,29 @@
 #include "longlong.h"
 #include "radix.h"
 
+ulong
+radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+{
+    slong i;
+    ulong q, r, hi, lo, B = LIMB_RADIX(radix);
+
+    q = a[an - 1] >> 1;
+    r = a[an - 1] & 1;
+    res[an - 1] = q;
+
+    for (i = an - 2; i >= 0; i--)
+    {
+        umul_ppmm(hi, lo, r, B);
+        add_ssaaaa(hi, lo, hi, lo, 0, a[i]);
+        q = (hi << (FLINT_BITS - 1)) | (lo >> 1);
+        r = lo & 1;
+        res[i] = q;
+    }
+
+    return r;
+}
+
+
 /* todo: optimise */
 ulong
 radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix)
@@ -154,6 +177,33 @@ radix_inv_approx_basecase(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_
 /* Must be at least 4 */
 #define RADIX_INV_NEWTON_CUTOFF 8
 
+/*
+Claim: the absolute error is bounded by 4*B^(-n)/A. This is not tight.
+
+Proof sketch: let m = ceil(n/2) + 1  + [B <= 3].
+
+The initial approximation is assumed to satisfy
+
+    Y = 1/A + e1   where  e1 <= C*B^(-m)/A   where we take C = 4.
+
+We compute Y' = Y + (1 - Y * (A + e2) + e3) * Y + e4 where
+
+    e2 = B^(-n-1) is the error from truncating A,
+
+    e3 = B^(-n) + {(m+2)*B^(-n-1)}
+    e4 = B^(-n) + {(m+2)*B^(-n-1)}
+
+are the errors from fixed-point multiplications where the term in curly
+brackets appears only when we use mulhigh, when B > 2 * (m + 2) is satisfied.
+
+Expanding the expression for Y' gives
+
+    |Y' - 1/A| <= A*e1^2 + e1^2*e2 + e1*e3 + e4 - 2*e1*e2/A + e3/A + e2/A^2
+
+Verifying that the RHS satisfies the claimed bound is now straightforward
+(but exhausting) numerics.
+*/
+
 void
 radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
 {
@@ -173,7 +223,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
         Compute T ~= 1 / A as a fixed-point number with m fraction limbs,
         2 integral limbs and store in the high part of Q.
         */
-        m = n / 2 + 1 + (LIMB_RADIX(radix) == 2);
+        m = (n + 1) / 2 + 1 + (LIMB_RADIX(radix) <= 3);
         radix_inv_approx(Q + n - m, A, An, m, radix);
 
         TMP_START;
@@ -232,7 +282,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
         // product.
         slong low_trunc_with_mulhigh = m - 1;
 
-        if (LIMB_RADIX(radix) > m + 2 && A_low_zeroes <= low_trunc_with_mulhigh)
+        if (LIMB_RADIX(radix) > 2 * (m + 2) && A_low_zeroes <= low_trunc_with_mulhigh)
         {
             U = TMP_ALLOC((2 + (control_limb + 1)) * sizeof(ulong));
             radix_mulmid(U, Ahigh, Ahighn, T, Tn,
@@ -275,7 +325,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
         if (Uhighn != 0)
         {
             /* Compute V = T * U with n fraction limbs. */
-            if (LIMB_RADIX(radix) > m + 2)
+            if (LIMB_RADIX(radix) > 2 * (m + 2))
             {
                 // V = T * |1 - A' * T| with n+2 fraction limbs
                 V = TMP_ALLOC((Tn + Uhighn - (m - 2)) * sizeof(ulong));
@@ -304,6 +354,15 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
     }
 }
 
+/* Instead of computing Q with ~1 ulp error and doing a full multiplication
+   to check the remainder, compute a few fraction limbs. If the first
+   fraction limb is not too close to the limb boundary, we have the correct
+   quotient and the remainder can be determined by a low multiplication
+   (or omitted if we only want the quotient). */
+#define USE_PREINV2 1
+/* Must be >= 2 */
+#define PREINV2_EXTRA_LIMBS 2
+
 void
 radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slong Bn, nn_srcptr Binv, slong Binvn, const radix_t radix)
 {
@@ -319,17 +378,61 @@ radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon
     if (Binvn < n)
         flint_throw(FLINT_ERROR, "radix_divrem_preinv: inverse has too few limbs");
 
-    if (LIMB_RADIX(radix) > n + 2)
+    slong n2 = n + PREINV2_EXTRA_LIMBS;
+
+    /*
+        Consider A, B as fixed-point numbers with A in [0,1), 1/B in [1/beta,1),
+        i.e. the integer quotient is floor((A/B)*beta^(an-bn)).
+
+        B' = High n2 limbs of Binv = 1/B * (1 + e1)  where e1 <= 4*beta^(-n2)
+        e2 <= beta^(-n2)  [truncation error for A]
+        e3 <= n2*beta^(-n2+1)  [truncation error for product]
+
+        (A + e2) * B' + e3 - A/B   = [A/B*e1 + e3 + e1*e2/B + e2/B]
+                                  <= [(e1 + e1*e2 + e2)*beta + e3]
+                                   < [(n2 + 6) beta^(-n2+1)]
+
+        PREINV2_EXTRA_LIMBS >= 2 ensures that q[-1] has <1 ulp error.
+    */
+#if USE_PREINV2
+    if (Binvn >= n2 && An >= n2 && LIMB_RADIX(radix) > n2 + 6)
     {
-        U = TMP_ALLOC((n + 3) * sizeof(ulong));
-        radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix);
-        q = U + 2;
+        U = TMP_ALLOC((n2 + 2) * sizeof(ulong));
+        /* n2 fraction limbs x (n2 fraction limbs + 2 integral limbs) */
+        radix_mulmid(U,  A + An - n2, n2, Binv + Binvn - n2, n2 + 2, n2, 2 * n2 + 2, radix);
+
+        FLINT_ASSERT(U[n2 + 1] == 0);
+        q = U + n2 + 2 - (n + 1);
+
+        if (q[-1] > 1 && q[-1] < LIMB_RADIX(radix) - 1)
+        {
+            if (R == NULL)
+                r = NULL;
+            else
+            {
+                T = TMP_ALLOC(Bn * sizeof(ulong));
+                r = T;
+                radix_mulmid(r, q, FLINT_MIN(Bn, n + 1), B, Bn, 0, Bn, radix);
+                radix_sub(r, A, Bn, r, Bn, radix);
+            }
+            goto done;
+        }
     }
     else
+#endif
     {
-        U = TMP_ALLOC((2 * n + 2) * sizeof(ulong));
-        radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix);
-        q = U + An - Bn + 2;
+        if (LIMB_RADIX(radix) > n + 2)
+        {
+            U = TMP_ALLOC((n + 3) * sizeof(ulong));
+            radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix);
+            q = U + 2;
+        }
+        else
+        {
+            U = TMP_ALLOC((2 * n + 2) * sizeof(ulong));
+            radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix);
+            q = U + An - Bn + 2;
+        }
     }
 
     T = TMP_ALLOC((Bn + n) * sizeof(ulong));
@@ -354,8 +457,10 @@ radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon
         radix_sub(r, r, Bn + 1, B, Bn, radix);
     }
 
+done:
     flint_mpn_copyi(Q, q, An - Bn + 1);
-    flint_mpn_copyi(R, r, Bn);
+    if (R != NULL)
+        flint_mpn_copyi(R, r, Bn);
 
     TMP_END;
 }
@@ -370,15 +475,21 @@ radix_divrem_newton(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon
 
     if (flint_mpn_zero_p(A + Bn, An - Bn) && mpn_cmp(A, B, Bn) < 0)
     {
-        flint_mpn_copyi(R, A, Bn);
+        if (R != NULL)
+            flint_mpn_copyi(R, A, Bn);
         flint_mpn_zero(Q, An - Bn + 1);
         return;
     }
 
     n = An - Bn + 1;
-    T = TMP_ALLOC((n + 2) * sizeof(ulong));
+    T = TMP_ALLOC((n + PREINV2_EXTRA_LIMBS + 2) * sizeof(ulong));
+#if USE_PREINV2
+    radix_inv_approx(T, B, Bn, n + PREINV2_EXTRA_LIMBS, radix);
+    radix_divrem_preinv(Q, R, A, An, B, Bn, T, n + PREINV2_EXTRA_LIMBS, radix);
+#else
     radix_inv_approx(T, B, Bn, n, radix);
     radix_divrem_preinv(Q, R, A, An, B, Bn, T, n, radix);
+#endif
     TMP_END;
     return;
 }
@@ -392,11 +503,13 @@ radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, c
 
     if (bn == 1)
     {
-        r[0] = radix_divrem_1(q, a, an, b[0], radix);
+        ulong r0 = radix_divrem_1(q, a, an, b[0], radix);
+        if (r != NULL)
+            r[0] = r0;
         return;
     }
 
-    if (bn >= 48)
+    if ((bn >= 20 && (an - bn + 1) <= 70) || bn >= 80)
     {
         radix_divrem_newton(q, r, a, an, b, bn, radix);
         return;
@@ -438,7 +551,8 @@ radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, c
             cy = q[i * bn];
         }
 
-        flint_mpn_copyi(r, R, bn);
+        if (r != NULL)
+            flint_mpn_copyi(r, R, bn);
         TMP_END;
         return;
     }
diff --git a/src/radix/integer.c b/src/radix/integer.c
index d6f5969985..d3e7197c12 100644
--- a/src/radix/integer.c
+++ b/src/radix/integer.c
@@ -862,6 +862,48 @@ void radix_integer_tdiv_qr(radix_integer_t q, radix_integer_t r,
     r->size = (asize < 0) ? -rn : rn;
 }
 
+void radix_integer_tdiv_q(radix_integer_t q,
+    const radix_integer_t a, const radix_integer_t b, const radix_t radix)
+{
+    slong asize, bsize, an, bn, qn;
+    nn_srcptr ad, bd;
+    nn_ptr qd;
+
+    asize = a->size;
+    bsize = b->size;
+
+    an = FLINT_ABS(asize);
+    bn = FLINT_ABS(bsize);
+
+    if (bn == 0)
+        flint_throw(FLINT_DIVZERO, "radix_integer_tdiv_qr: divide by zero");
+
+    if (an == 0)
+    {
+        radix_integer_zero(q, radix);
+        return;
+    }
+
+    if (an < bn)
+    {
+        radix_integer_zero(q, radix);
+        return;
+    }
+
+    qn = an - bn + 1;
+
+    ad = a->d;
+    bd = b->d;
+
+    qd = radix_integer_fit_limbs(q, qn, radix);
+
+    radix_divrem(qd, NULL, ad, an, bd, bn, radix);
+
+    MPN_NORM(qd, qn);
+
+    q->size = ((asize < 0) == (bsize < 0)) ? qn : -qn;
+}
+
 void radix_integer_fdiv_qr(radix_integer_t q, radix_integer_t r,
     const radix_integer_t a, const radix_integer_t b, const radix_t radix)
 {
@@ -1003,15 +1045,6 @@ void radix_integer_cdiv_qr(radix_integer_t q, radix_integer_t r,
 }
 
 /* Todo: optimize */
-void
-radix_integer_tdiv_q(radix_integer_t q,
-    const radix_integer_t a, const radix_integer_t b, const radix_t radix)
-{
-    radix_integer_t r;
-    radix_integer_init(r, radix);
-    radix_integer_tdiv_qr(q, r, a, b, radix);
-    radix_integer_clear(r, radix);
-}
 
 void
 radix_integer_fdiv_q(radix_integer_t q,
diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c
new file mode 100644
index 0000000000..6ab3ce8576
--- /dev/null
+++ b/src/radix/mul_1.c
@@ -0,0 +1,114 @@
+/*
+    Copyright (C) 2026 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "radix.h"
+
+ulong
+radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix)
+{
+    slong i;
+    ulong hi, lo;
+    slong sbits;
+    nmod_t mod = radix->B;
+
+    FLINT_ASSERT(n >= 1);
+    FLINT_ASSERT(c < mod.n);
+
+    sbits = 2 * NMOD_BITS(mod);
+
+    if (sbits <= FLINT_BITS)
+    {
+        ulong cy = 0;
+        FLINT_ASSERT(mod.norm != 0);
+
+        for (i = 0; i < n; i++)
+        {
+            cy += a[i] * c;
+            z[i] = n_divrem_preinv_unnorm(&cy, cy, mod.n, mod.ninv, mod.norm);
+        }
+
+        FLINT_ASSERT(cy < mod.n);
+        return cy;
+    }
+    else if (sbits <= 2 * FLINT_BITS)
+    {
+        ulong cy[2] = { 0, 0, };
+
+        if (mod.norm == 0)
+        {
+            for (i = 0; i < n; i++)
+            {
+                umul_ppmm(hi, lo, a[i], c);
+                add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
+                z[i] = flint_mpn_divrem_2_1_preinv_norm(cy, cy, mod.n, mod.ninv);
+            }
+        }
+        else
+        {
+            for (i = 0; i < n; i++)
+            {
+                umul_ppmm(hi, lo, a[i], c);
+                add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
+                z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm);
+            }
+        }
+
+        FLINT_ASSERT(cy[0] < mod.n);
+        FLINT_ASSERT(cy[1] == 0);
+        return cy[0];
+    }
+    else
+    {
+        if (mod.norm == 0)
+        {
+            ulong r;
+            ulong cy0 = 0;
+            ulong cy1 = 0;
+            ulong cy2 = 0;
+
+            umul_ppmm(hi, lo, a[0], c);
+            udiv_qrnnd_preinv(cy0, r, hi, lo, mod.n, mod.ninv);
+            z[0] = r;
+
+            for (i = 1; i < n; i++)
+            {
+                umul_ppmm(hi, lo, a[i], c);
+                add_sssaaaaaa(cy2, cy1, cy0, cy2, cy1, cy0, 0, hi, lo);
+                /* cy2 is certainly already reduced */
+                udiv_qrnnd_preinv(cy1, r, cy2, cy1, mod.n, mod.ninv);
+                udiv_qrnnd_preinv(cy0, r, r, cy0, mod.n, mod.ninv);
+                cy2 = 0;
+                z[i] = r;
+            }
+
+            FLINT_ASSERT(cy0 < mod.n);
+            FLINT_ASSERT(cy1 == 0);
+            return cy0;
+        }
+        else
+        {
+            ulong cy[3] = { 0, 0, 0 };
+
+            for (i = 0; i < n; i++)
+            {
+                umul_ppmm(hi, lo, a[i], c);
+                add_sssaaaaaa(cy[2], cy[1], cy[0], cy[2], cy[1], cy[0], 0, hi, lo);
+                z[i] = flint_mpn_divrem_3_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm);
+            }
+
+            FLINT_ASSERT(cy[0] < mod.n);
+            FLINT_ASSERT(cy[1] == 0);
+            FLINT_ASSERT(cy[2] == 0);
+            return cy[0];
+        }
+    }
+}
+
diff --git a/src/radix/mulmid_classical.c b/src/radix/mulmid_classical.c
index 6bcf13006e..b415fa2bcc 100644
--- a/src/radix/mulmid_classical.c
+++ b/src/radix/mulmid_classical.c
@@ -242,7 +242,6 @@ radix_sqrmid_classical(nn_ptr z, nn_srcptr a, slong an, slong zlo, slong zhi, co
     }
 }
 
-
 void
 radix_mulmid_classical(nn_ptr z, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong zlo, slong zhi, const radix_t radix)
 {
diff --git a/src/radix/profile/p-tdiv_q.c b/src/radix/profile/p-tdiv_q.c
new file mode 100644
index 0000000000..af99553fb0
--- /dev/null
+++ b/src/radix/profile/p-tdiv_q.c
@@ -0,0 +1,107 @@
+/*
+    Copyright (C) 2026 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include <math.h>
+#include "profiler.h"
+#include "radix.h"
+#include "fmpz.h"
+
+/* there is no flint_mpn_tdiv_q yet, so mock one up */
+void
+flint_mpn_tdiv_q(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn)
+{
+    if (bn < 1000)
+    {
+        mpn_tdiv_qr(q, r, 0, a, an, b, bn);
+    }
+    else
+    {
+        fmpz_t fa, fb, fq;
+
+        fmpz_init(fa);
+        fmpz_init(fb);
+        fmpz_init(fq);
+
+        fmpz_set_ui_array(fa, a, an);
+        fmpz_set_ui_array(fb, b, bn);
+
+        fmpz_tdiv_q(fq, fa, fb);
+
+        //fmpz_get_ui_array(q, an - bn + 1, fq);
+        //fmpz_get_ui_array(r, bn, fr);
+
+        fmpz_clear(fa);
+        fmpz_clear(fb);
+        fmpz_clear(fq);
+    }
+}
+
+int main()
+{
+    radix_t radix;
+    slong digits, nd, n1, n2;
+    nn_ptr a, b, c, d;
+    double tmpn, tradix, FLINT_SET_BUT_UNUSED(tt);
+
+    flint_rand_t state;
+    flint_rand_init(state);
+
+    flint_printf("   decimal      mpn  decimal        time        time    relative\n");
+    flint_printf("    digits    limbs    limbs flint_mpn_tdiv_q radix_tdiv_q  time\n\n");
+
+    for (nd = 1; nd <= 100000000; nd = FLINT_MAX(nd + 1, nd * 1.5))
+    {
+        radix_init(radix, 10, 0);
+
+        digits = nd * 19;
+
+        /* Number of full-word limbs */
+        n1 = (slong) (digits * (log(10) / (FLINT_BITS * log(2))) + 1.0);
+        /* Number of radix 10^e limbs */
+        n2 = (digits + radix->exp - 1) / radix->exp;
+
+        a = flint_malloc(2 * n2 * sizeof(ulong));
+        b = flint_malloc(n2 * sizeof(ulong));
+        c = flint_malloc((n2 + 1) * sizeof(ulong));
+        d = flint_malloc(n2 * sizeof(ulong));
+
+        flint_mpn_urandomb(a, state, 2 * n1 * FLINT_BITS);
+        flint_mpn_urandomb(b, state, n1 * FLINT_BITS);
+
+        flint_mpn_tdiv_q(c, d, a, 2 * n1, b, n1);
+        TIMEIT_START;
+        flint_mpn_tdiv_q(c, d, a, 2 * n1, b, n1);
+        TIMEIT_STOP_VALUES(tt, tmpn);
+
+        radix_rand_limbs(a, state, 2 * n2, radix);
+        radix_rand_limbs(b, state, n2, radix);
+
+        radix_divrem(c, NULL, a, 2 * n2, b, n2, radix);
+        TIMEIT_START;
+        radix_divrem(c, NULL, a, 2 * n2, b, n2, radix);
+        TIMEIT_STOP_VALUES(tt, tradix);
+
+        flint_printf("%10wd %8wd %8wd    %8g    %8g    %.3fx\n",
+            digits, n1, n2, tmpn, tradix, tradix / tmpn);
+
+        flint_free(a);
+        flint_free(b);
+        flint_free(c);
+        flint_free(d);
+
+        radix_clear(radix);
+    }
+
+    flint_rand_clear(state);
+    flint_cleanup_master();
+    return 0;
+}
+
diff --git a/src/radix/sqrt.c b/src/radix/sqrt.c
new file mode 100644
index 0000000000..3c7812fbf0
--- /dev/null
+++ b/src/radix/sqrt.c
@@ -0,0 +1,129 @@
+/*
+    Copyright (C) 2026 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "radix.h"
+
+void radix_rsqrt_1_approx_basecase(nn_ptr Y, ulong a, slong n, const radix_t radix)
+{
+    nn_ptr U, V;
+    slong nU, nV;
+    TMP_INIT;
+
+    TMP_START;
+    U = TMP_ALLOC((4 * n + 2) * sizeof(ulong));
+    V = U + 2 * n + 1;
+
+    flint_mpn_zero(U, 2 * n);
+    U[2 * n] = a;
+
+    nV = radix_get_mpn(V, U, 2 * n + 1, radix);
+    mpn_sqrtrem(U, NULL, V, nV);
+    nU = (nV + 1) / 2;
+    MPN_NORM(U, nU);
+    mpn_divrem_1(U, 0, U, nU, a);
+    MPN_NORM(U, nU);
+
+    nV = radix_set_mpn(V, U, nU, radix);
+
+    flint_mpn_copyi(Y, V, nV);
+    flint_mpn_zero(Y + nV, n - nV);
+
+    TMP_END;
+}
+
+// todo: allocs
+// todo: fast mul_1
+// todo: fast divrem_two
+
+/*
+The error is smaller than 2 ulp, and for large radix, 1+eps ulp. Proof sketch:
+suppose on input y = 1/sqrt(a) + eps where we assume that |eps| <= C*B^(-m)
+for some C < 2. The mathematical error after the Newton iteration is
+
+    1/sqrt(a) - 3*sqrt(a)/2 * eps^2 - a/2 * eps^3.
+
+The calculation of 1 - a*y^2 is done exactly and generates a fixed-point number
+with 2m-limb precision. Subsequently dividing by 2 generates up to B^(-2m)
+error. Rounding the result of the final multiplication by y generates up to
+B^(-n) error, plus B^(-2m) error if mulhigh is used.
+
+Note that m >= n/2 + 1. Setting e.g. C = 1.7 one can check explicitly that
+the error is bounded by C*B^(-n) in the worst case B = 3.
+For larger B one can choose C closer to 1.
+*/
+
+void radix_rsqrt_1_approx(nn_ptr Y, ulong a, slong n, const radix_t radix)
+{
+    if (n <= 4)
+    {
+        radix_rsqrt_1_approx_basecase(Y, a, n, radix);
+    }
+    else
+    {
+        slong m, Un;
+        ulong cy;
+        nn_ptr T, U;
+        slong Talloc, Ualloc;
+        nn_ptr Yhi, Thi;
+        TMP_INIT;
+
+        m = (n + 1) / 2 + 1;
+
+        Yhi = Y + n - m;
+        radix_rsqrt_1_approx(Yhi, a, m, radix);
+        flint_mpn_zero(Y, n - m);
+
+        /* TODO: reduce temporary space */
+        if (LIMB_RADIX(radix) > m)
+            Talloc = m + 3;   /* In case of high product */
+        else
+            Talloc = 2 * m + 2;   /* In case of full product */
+        Ualloc = m + 2;
+
+        TMP_START;
+        T = TMP_ALLOC((Talloc + Ualloc) * sizeof(ulong));
+        U = T + Talloc;
+
+        /* T = Yhi^2 with 2m fraction limbs */
+        radix_mulmid(T, Yhi, m, Yhi, m, 0, m + 2, radix);
+        radix_mulmid(U, T, m + 2, &a, 1, 0, m + 2, radix);
+        cy = (U[m + 1] == 0);
+        /* a*y^2-1 or 1-a*y^2 */
+        if (!cy)
+            radix_neg(U, U, m + 2, radix);
+
+        radix_divrem_two(U, U, m + 2, radix);
+        Un = m + 2;
+        MPN_NORM(U, Un);
+        if (Un == 0)
+            goto cleanup;
+
+        if (LIMB_RADIX(radix) > m)
+        {
+            radix_mulmid(T, Yhi, m, U, Un, m - 2, Un + m, radix);
+            Thi = T + 2;
+        }
+        else
+        {
+            radix_mulmid(T, Yhi, m, U, Un, 0, Un + m, radix);
+            Thi = T + m;
+        }
+
+        if (cy)
+            radix_sub(Y, Y, n, Thi + 2 * m - n, Un + n - 2 * m, radix);
+        else
+            radix_add(Y, Y, n, Thi + 2 * m - n, Un + n - 2 * m, radix);
+
+cleanup:
+        TMP_END;
+    }
+}
+
diff --git a/src/radix/test/main.c b/src/radix/test/main.c
index 4d33bd358c..d5bb3ed9c6 100644
--- a/src/radix/test/main.c
+++ b/src/radix/test/main.c
@@ -18,10 +18,12 @@
 #include "t-integer.c"
 #include "t-inv_approx.c"
 #include "t-invmod_bn.c"
+#include "t-mul_1.c"
 #include "t-mulmid_classical.c"
 #include "t-mulmid_fft_small.c"
 #include "t-mulmid_KS.c"
 #include "t-neg.c"
+#include "t-rsqrt_1_approx.c"
 #include "t-set_mpn.c"
 #include "t-sub.c"
 
@@ -36,10 +38,12 @@ test_struct tests[] =
     TEST_FUNCTION(radix_integer),
     TEST_FUNCTION(radix_inv_approx),
     TEST_FUNCTION(radix_invmod_bn),
+    TEST_FUNCTION(radix_mul_1),
     TEST_FUNCTION(radix_mulmid_classical),
     TEST_FUNCTION(radix_mulmid_fft_small),
     TEST_FUNCTION(radix_mulmid_KS),
     TEST_FUNCTION(radix_neg),
+    TEST_FUNCTION(radix_rsqrt_1_approx),
     TEST_FUNCTION(radix_set_mpn),
     TEST_FUNCTION(radix_sub),
 };
diff --git a/src/radix/test/t-divrem.c b/src/radix/test/t-divrem.c
index 2532425750..ce72875069 100644
--- a/src/radix/test/t-divrem.c
+++ b/src/radix/test/t-divrem.c
@@ -19,7 +19,7 @@ TEST_FUNCTION_START(radix_divrem, state)
     for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++)
     {
         radix_t radix;
-        nn_ptr a, b, q, r, bq, bqr;
+        nn_ptr a, b, q, q2, r, bq, bqr;
         slong an, bn, qn, rn;
         int algorithm, aliasing;
 
@@ -35,6 +35,7 @@ TEST_FUNCTION_START(radix_divrem, state)
         a = flint_malloc(an * sizeof(ulong));
         b = flint_malloc(bn * sizeof(ulong));
         q = flint_malloc(qn * sizeof(ulong));
+        q2 = flint_malloc(qn * sizeof(ulong));
         r = flint_malloc(rn * sizeof(ulong));
         bq = flint_malloc(an * sizeof(ulong));
         bqr = flint_malloc(an * sizeof(ulong));
@@ -123,11 +124,37 @@ TEST_FUNCTION_START(radix_divrem, state)
             flint_abort();
         }
 
+        aliasing = n_randint(state, 2);
+
+        if (aliasing == 0)
+        {
+            divrem_func(q2, NULL, a, an, b, bn, radix);
+        }
+        else
+        {
+            q2 = flint_realloc(q2, an * sizeof(ulong));
+            flint_mpn_copyi(q2, a, an);
+            divrem_func(q2, NULL, q2, an, b, bn, radix);
+        }
+
+        if (mpn_cmp(q, q2, qn) != 0)
+        {
+            flint_printf("FAIL: radix_divrem (r == NULL)\n");
+            flint_printf("algorithm = %d\n", algorithm);
+            flint_printf("aliasing = %d\n", aliasing);
+            flint_printf("a = %{ulong*}\n", a, an);
+            flint_printf("b = %{ulong*}\n", b, bn);
+            flint_printf("q = %{ulong*}\n", q, qn);
+            flint_printf("q2 = %{ulong*}\n", q2, qn);
+            flint_abort();
+        }
+
         radix_clear(radix);
 
         flint_free(a);
         flint_free(b);
         flint_free(q);
+        flint_free(q2);
         flint_free(r);
         flint_free(bq);
         flint_free(bqr);
diff --git a/src/radix/test/t-divrem_1.c b/src/radix/test/t-divrem_1.c
index ad095494de..d134e3ab3d 100644
--- a/src/radix/test/t-divrem_1.c
+++ b/src/radix/test/t-divrem_1.c
@@ -108,6 +108,19 @@ TEST_FUNCTION_START(radix_divrem_1, state)
             flint_abort();
         }
 
+        r1 = radix_divrem_two(c, a, an, radix);
+        r2 = radix_divrem_1_naive(d, a, an, 2, radix);
+
+        if (r1 != r2 || mpn_cmp(c, d, an) != 0)
+        {
+            flint_printf("FAIL: divrem_two\n");
+            flint_printf("%{ulong*}\n", a, an);
+            flint_printf("%{ulong*}\n", c, an);
+            flint_printf("%{ulong*}\n", d, an);
+            flint_printf("%wu\n%wu\n\n", r1, r2);
+            flint_abort();
+        }
+
         radix_clear(radix);
 
         flint_free(a);
diff --git a/src/radix/test/t-mul_1.c b/src/radix/test/t-mul_1.c
new file mode 100644
index 0000000000..f51021c68b
--- /dev/null
+++ b/src/radix/test/t-mul_1.c
@@ -0,0 +1,72 @@
+/*
+    Copyright (C) 2026 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "radix.h"
+
+TEST_FUNCTION_START(radix_mul_1, state)
+{
+    slong iter;
+
+    for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++)
+    {
+        radix_t radix;
+        nn_ptr a, b, c;
+        ulong x;
+        slong n;
+        ulong cy;
+
+        radix_init_randtest(radix, state);
+
+        n = 1 + n_randint(state, 5);
+
+        a = flint_malloc(n * sizeof(ulong));
+        b = flint_malloc(n * sizeof(ulong));
+        c = flint_malloc((n + 1) * sizeof(ulong));
+
+        radix_randtest_limbs(a, state, n, radix);
+        radix_randtest_limbs(b, state, n, radix);
+        radix_randtest_limbs(c, state, n + 1, radix);
+        radix_randtest_limbs(&x, state, 1, radix);
+
+        if (n_randint(state, 2))
+        {
+            cy = radix_mul_1(b, a, n, x, radix);
+        }
+        else
+        {
+            flint_mpn_copyi(b, a, n);
+            cy = radix_mul_1(b, b, n, x, radix);
+        }
+
+        radix_mulmid_naive(c, a, n, &x, 1, 0, n + 1, radix);
+
+        if (mpn_cmp(b, c, n) != 0 || cy != c[n])
+        {
+            flint_printf("FAIL: mul_1\n");
+            flint_printf("radix %wu^%wd, n = %wd\n", radix->B.n, radix->exp, n);
+            flint_printf("a = %{ulong*}\n", a, n);
+            flint_printf("b = %{ulong*}\n", b, n);
+            flint_printf("c = %{ulong*}\n", c, n + 1);
+            flint_printf("x = %wu\n", x);
+            flint_printf("cy = %wu\n", cy);
+            flint_abort();
+        }
+
+        radix_clear(radix);
+
+        flint_free(a);
+        flint_free(b);
+        flint_free(c);
+    }
+
+    TEST_FUNCTION_END(state);
+}
diff --git a/src/radix/test/t-rsqrt_1_approx.c b/src/radix/test/t-rsqrt_1_approx.c
new file mode 100644
index 0000000000..c598bfe213
--- /dev/null
+++ b/src/radix/test/t-rsqrt_1_approx.c
@@ -0,0 +1,88 @@
+/*
+    Copyright (C) 2026 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include <math.h>
+#include "test_helpers.h"
+#include "radix.h"
+
+TEST_FUNCTION_START(radix_rsqrt_1_approx, state)
+{
+    slong iter;
+
+    for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++)
+    {
+        nn_ptr A, B, err;
+        ulong a;
+        slong n, errn, err_limbs = 10;
+        double dbl_err, dbl_powB;
+        slong j;
+
+        radix_t radix;
+        radix_init_randtest(radix, state);
+
+        if (n_randint(state, 100))
+            n = 1 + n_randint(state, 50);
+        else
+            n = 1 + n_randint(state, 500);
+
+        a = n_randint(state, LIMB_RADIX(radix));
+
+        if (a > 1)
+        {
+            A = flint_malloc(sizeof(ulong) * (n + err_limbs));
+            B = flint_malloc(sizeof(ulong) * (n + err_limbs));
+            err = flint_malloc(sizeof(ulong) * (n + err_limbs));
+
+            radix_randtest_limbs(A, state, n, radix);
+            radix_randtest_limbs(B, state, n, radix);
+
+            radix_rsqrt_1_approx_basecase(A, a, n + err_limbs, radix);
+            flint_mpn_zero(B, n + err_limbs);
+            flint_mpn_zero(B, err_limbs);
+            radix_rsqrt_1_approx(B + err_limbs, a, n, radix);
+
+            if (mpn_cmp(A, B, n + err_limbs) >= 0)
+                radix_sub(err, A, n + err_limbs, B, n + err_limbs, radix);
+            else
+                radix_sub(err, B, n + err_limbs, B, n + err_limbs, radix);
+
+            errn = n + err_limbs;
+            MPN_NORM(err, errn);
+
+            dbl_err = 0.0;
+            dbl_powB = pow(LIMB_RADIX(radix), -err_limbs);
+            for (j = 0; j < errn; j++)
+            {
+                dbl_err += dbl_powB * err[j];
+                dbl_powB *= LIMB_RADIX(radix);
+            }
+
+            if (dbl_err > 1.7)
+            {
+                flint_printf("FAIL: too large error: radix %wu^%wd, n = %wd\n", radix->B.n, radix->exp, n);
+                flint_printf("a = %wu\n", a);
+                flint_printf("A = %{ulong*}\n", A, n + err_limbs);
+                flint_printf("B = %{ulong*}\n", B, n + err_limbs);
+                flint_printf("err = %{ulong*}\n", err, n + err_limbs);
+                flint_printf("dbl_err: %f\n", dbl_err);
+                flint_abort();
+            }
+
+            flint_free(A);
+            flint_free(B);
+            flint_free(err);
+        }
+
+        radix_clear(radix);
+    }
+
+    TEST_FUNCTION_END(state);
+}

From e0cea1d26c2fee3cc4d32f862c8993ee61d37283 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 19 Feb 2026 15:32:06 +0100
Subject: [PATCH 2/5] Avoid add of length <= 0

---
 src/radix/sqrt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/radix/sqrt.c b/src/radix/sqrt.c
index 3c7812fbf0..175099980f 100644
--- a/src/radix/sqrt.c
+++ b/src/radix/sqrt.c
@@ -103,7 +103,7 @@ void radix_rsqrt_1_approx(nn_ptr Y, ulong a, slong n, const radix_t radix)
         radix_divrem_two(U, U, m + 2, radix);
         Un = m + 2;
         MPN_NORM(U, Un);
-        if (Un == 0)
+        if (Un + n - 2 * m <= 0)
             goto cleanup;
 
         if (LIMB_RADIX(radix) > m)

From 21167816bef31acc59e6adf4ef70ff8514425094 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 19 Feb 2026 17:13:32 +0100
Subject: [PATCH 3/5] Radix constants profile program

---
 src/radix/profile/p-constants.c | 305 ++++++++++++++++++++++++++++++++
 1 file changed, 305 insertions(+)
 create mode 100644 src/radix/profile/p-constants.c

diff --git a/src/radix/profile/p-constants.c b/src/radix/profile/p-constants.c
new file mode 100644
index 0000000000..55486d8c6c
--- /dev/null
+++ b/src/radix/profile/p-constants.c
@@ -0,0 +1,305 @@
+/*
+    Copyright (C) 2026 Fredrik Johansson
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include <string.h>
+#include "arb.h"
+#include "radix.h"
+#include "double_extras.h"
+#include "mag.h"
+#include "profiler.h"
+#include "hypgeom.h"
+#include "fmpz_poly.h"
+
+/* Don't call arb_const_e because it caches the result. */
+static void
+arb_const_e_eval(arb_t s, slong prec)
+{
+    hypgeom_t series;
+    arb_t t;
+
+    arb_init(t);
+    hypgeom_init(series);
+
+    fmpz_poly_set_str(series->A, "1  1");
+    fmpz_poly_set_str(series->B, "1  1");
+    fmpz_poly_set_str(series->P, "1  1");
+    fmpz_poly_set_str(series->Q, "2  0 1");
+
+    prec += FLINT_CLOG2(prec);
+    arb_hypgeom_infsum(s, t, series, prec, prec);
+    arb_div(s, s, t, prec);
+
+    hypgeom_clear(series);
+    arb_clear(t);
+}
+
+char * radix_integer_get_str_decimal(char * s, const radix_integer_t x, const radix_t radix)
+{
+    return radix_get_str_decimal(s, x->d, FLINT_ABS(x->size), x->size < 0, radix);
+}
+
+slong radix_digits_to_limbs(slong n, const radix_t radix)
+{
+    return (n + radix->exp - 1) / radix->exp;
+}
+
+void
+print_condensed(const char * s, slong ndigits, slong n)
+{
+    char * tmp = flint_malloc(n + 1);
+
+    strncpy(tmp, s, 1);
+    tmp[1] = '\0';
+    flint_printf("%s.", tmp);
+
+    strncpy(tmp, s + 1, n);
+    tmp[n] = '\0';
+
+    flint_printf("%s{...%wd digits...}%s\n", tmp, ndigits - 2 * n - 1, s + ndigits - n);
+
+    flint_free(tmp);
+}
+
+/* Calculate sqrt(a) as a fixed-point number with n fraction limbs. */
+void
+radix_integer_fixed_sqrt_1(radix_integer_t res, ulong a, slong n, const radix_t radix)
+{
+    nn_ptr rd;
+    rd = radix_integer_fit_limbs(res, n + 1, radix);
+    radix_rsqrt_1_approx(rd, a, n, radix);
+    if (a == 2)
+        rd[n] = radix_mul_two(rd, rd, n, radix);
+    else
+        rd[n] = radix_mul_1(rd, rd, n, a, radix);
+    res->size = n + 1;
+}
+
+#define BSPLIT_E_MPN_BASECASE 2
+
+static void
+bsplit_e(radix_integer_t P, radix_integer_t Q, slong a, slong b, const radix_t radix)
+{
+    if (b - a == 1)
+    {
+        radix_integer_one(P, radix);
+        radix_integer_set_ui(Q, b, radix);
+    }
+    else if (b - a == 2 && b < (UWORD(1) << (FLINT_BITS / 2)))
+    {
+        radix_integer_set_ui(P, a + 3, radix);
+        radix_integer_set_ui(Q, (a + 1) * (a + 2), radix);
+    }
+    else if (b - a <= BSPLIT_E_MPN_BASECASE && b < (UWORD(1) << (FLINT_BITS / 2)))
+    {
+        ulong p[BSPLIT_E_MPN_BASECASE];
+        ulong q[BSPLIT_E_MPN_BASECASE];
+        ulong p2, q2;
+        slong pn, qn, k;
+
+        p[0] = a + 3;
+        q[0] = (a + 1) * (a + 2);
+        pn = qn = 1;
+
+        for (k = a + 2; k < b; k++)
+        {
+            if (k + 1 < b) { p2 = k + 3; q2 = (k + 1) * (k + 2); k++; }
+            else           { p2 = 1; q2 = k + 1; }
+
+            p[pn] = mpn_mul_1(p, p, pn, q2);
+            p[pn] += mpn_add_1(p, p, pn, p2);
+            pn += (p[pn] != 0);
+            q[qn] = mpn_mul_1(q, q, qn, q2);
+            qn += (q[qn] != 0);
+        }
+
+        P->size = radix_set_mpn(radix_integer_fit_limbs(P,
+            radix_set_mpn_need_alloc(pn, radix), radix), p, pn, radix);
+        Q->size = radix_set_mpn(radix_integer_fit_limbs(Q,
+            radix_set_mpn_need_alloc(qn, radix), radix), q, qn, radix);
+    }
+    else
+    {
+        slong m = a + (b - a) / 2;
+
+        radix_integer_t P1, Q1, P2, Q2;
+
+        radix_integer_init(P1, radix);
+        radix_integer_init(Q1, radix);
+        radix_integer_init(P2, radix);
+        radix_integer_init(Q2, radix);
+
+        bsplit_e(P1, Q1, a, m, radix);
+        bsplit_e(P2, Q2, m, b, radix);
+
+        radix_integer_mul(P, P1, Q2, radix);
+        radix_integer_add(P, P, P2, radix);
+        radix_integer_mul(Q, Q1, Q2, radix);
+
+        radix_integer_clear(P1, radix);
+        radix_integer_clear(Q1, radix);
+        radix_integer_clear(P2, radix);
+        radix_integer_clear(Q2, radix);
+    }
+}
+
+void
+radix_integer_fixed_e(radix_integer_t res, slong n, const radix_t radix)
+{
+    slong N;
+    radix_integer_t P, Q;
+
+    /* Select truncation N */
+    {
+        mag_t one, eps, err;
+
+        double logB = log(LIMB_RADIX(radix));
+        N = n * logB / d_lambertw(n * logB * exp(-1.0)) + 1;
+
+        mag_init(one);
+        mag_init(eps);
+        mag_init(err);
+
+        mag_one(one);
+        mag_set_ui(eps, LIMB_RADIX(radix));
+        mag_inv_lower(eps, eps);
+        mag_pow_ui_lower(eps, eps, n);
+
+        while (1)
+        {
+            mag_exp_tail(err, one, N + 1);
+            if (mag_cmp(err, eps) <= 0)
+                break;
+            else
+                N++;
+        }
+
+        mag_clear(one);
+        mag_clear(err);
+        mag_clear(eps);
+    }
+
+    radix_integer_init(P, radix);
+    radix_integer_init(Q, radix);
+
+    bsplit_e(P, Q, 1, N, radix);
+    radix_integer_lshift_limbs(P, P, n, radix);
+    radix_integer_tdiv_q(res, P, Q, radix);
+    radix_integer_set_limb(res, res, n, 2, radix);
+
+    radix_integer_clear(P, radix);
+    radix_integer_clear(Q, radix);
+}
+
+int main()
+{
+    slong ndigits;
+    radix_t radix;
+    char * s;
+
+    radix_init(radix, 10, 0);
+
+    flint_set_num_threads(1);
+
+    for (ndigits = 100000; ndigits <= 1000000000; ndigits *= 10)
+    {
+        radix_integer_t x;
+        arb_t y;
+
+        slong prec_bits = 3.32192809488736 * ndigits + 1;
+        slong prec_radix = radix_digits_to_limbs(ndigits, radix);
+
+        flint_printf("\n10^%wd digits (%wd bits; %wd decimal limbs = %wd digits)\n",
+            n_clog(ndigits, 10),
+            prec_bits, prec_radix, prec_radix * radix->exp);
+
+        arb_init(y);
+        flint_printf("Arb compute sqrt(2):     ");
+        TIMEIT_START;
+        arb_sqrt_ui(y, 2, prec_bits);
+        TIMEIT_STOP;
+
+        flint_printf("Arb to string:           ");
+        s = NULL;
+        TIMEIT_START;
+        flint_free(s);
+        s = arb_get_str(y, ndigits, ARB_STR_CONDENSE * 20);
+        TIMEIT_STOP;
+
+        flint_printf("%s\n", s);
+        arb_clear(y);
+
+        radix_integer_init(x, radix);
+        flint_printf("Radix compute sqrt(2):   ");
+        TIMEIT_START;
+        radix_integer_fixed_sqrt_1(x, 2, prec_radix, radix);
+        TIMEIT_STOP;
+
+        flint_printf("Radix to string:         ");
+        s = NULL;
+        TIMEIT_START;
+        flint_free(s);
+        s = radix_integer_get_str_decimal(NULL, x, radix);
+        TIMEIT_STOP;
+
+        print_condensed(s, ndigits, 20);
+        radix_integer_clear(x, radix);
+    }
+
+    for (ndigits = 100000; ndigits <= 1000000000; ndigits *= 10)
+    {
+        radix_integer_t x;
+        arb_t y;
+
+        slong prec_bits = 3.32192809488736 * ndigits + 1;
+        slong prec_radix = radix_digits_to_limbs(ndigits, radix);
+
+        flint_printf("\n10^%wd digits (%wd bits; %wd decimal limbs = %wd digits)\n",
+            n_clog(ndigits, 10),
+            prec_bits, prec_radix, prec_radix * radix->exp);
+
+        arb_init(y);
+        flint_printf("Arb compute e:     ");
+        TIMEIT_START;
+        arb_const_e_eval(y, prec_bits);
+        TIMEIT_STOP;
+
+        flint_printf("Arb to string:     ");
+        s = NULL;
+        TIMEIT_START;
+        flint_free(s);
+        s = arb_get_str(y, ndigits, ARB_STR_CONDENSE * 20);
+        TIMEIT_STOP;
+
+        flint_printf("%s\n", s);
+        arb_clear(y);
+
+        radix_integer_init(x, radix);
+        flint_printf("Radix compute e:   ");
+        TIMEIT_START;
+        radix_integer_fixed_e(x, prec_radix, radix);
+        TIMEIT_STOP;
+
+        flint_printf("Radix to string:   ");
+        s = NULL;
+        TIMEIT_START;
+        flint_free(s);
+        s = radix_integer_get_str_decimal(NULL, x, radix);
+        TIMEIT_STOP;
+
+        print_condensed(s, ndigits, 20);
+        radix_integer_clear(x, radix);
+    }
+
+    radix_clear(radix);
+    flint_cleanup_master();
+    return 0;
+}
+

From bbc2c953ae028d7c8dbfa74e057bd52397648a5d Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Thu, 19 Feb 2026 22:35:15 +0100
Subject: [PATCH 4/5] Improve radix_mul_1

---
 src/radix/mul_1.c | 49 +++++++----------------------------------------
 1 file changed, 7 insertions(+), 42 deletions(-)

diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c
index 6ab3ce8576..1d2f2d7b4b 100644
--- a/src/radix/mul_1.c
+++ b/src/radix/mul_1.c
@@ -38,54 +38,20 @@ radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix)
         FLINT_ASSERT(cy < mod.n);
         return cy;
     }
-    else if (sbits <= 2 * FLINT_BITS)
-    {
-        ulong cy[2] = { 0, 0, };
-
-        if (mod.norm == 0)
-        {
-            for (i = 0; i < n; i++)
-            {
-                umul_ppmm(hi, lo, a[i], c);
-                add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
-                z[i] = flint_mpn_divrem_2_1_preinv_norm(cy, cy, mod.n, mod.ninv);
-            }
-        }
-        else
-        {
-            for (i = 0; i < n; i++)
-            {
-                umul_ppmm(hi, lo, a[i], c);
-                add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
-                z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm);
-            }
-        }
-
-        FLINT_ASSERT(cy[0] < mod.n);
-        FLINT_ASSERT(cy[1] == 0);
-        return cy[0];
-    }
     else
     {
         if (mod.norm == 0)
         {
-            ulong r;
             ulong cy0 = 0;
             ulong cy1 = 0;
-            ulong cy2 = 0;
-
-            umul_ppmm(hi, lo, a[0], c);
-            udiv_qrnnd_preinv(cy0, r, hi, lo, mod.n, mod.ninv);
-            z[0] = r;
+            ulong r;
 
-            for (i = 1; i < n; i++)
+            for (i = 0; i < n; i++)
             {
                 umul_ppmm(hi, lo, a[i], c);
-                add_sssaaaaaa(cy2, cy1, cy0, cy2, cy1, cy0, 0, hi, lo);
-                /* cy2 is certainly already reduced */
-                udiv_qrnnd_preinv(cy1, r, cy2, cy1, mod.n, mod.ninv);
+                add_ssaaaa(cy1, cy0, cy1, cy0, hi, lo);
+                r = n_divrem_norm(&cy1, cy1, mod.n);
                 udiv_qrnnd_preinv(cy0, r, r, cy0, mod.n, mod.ninv);
-                cy2 = 0;
                 z[i] = r;
             }
 
@@ -95,18 +61,17 @@ radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix)
         }
         else
         {
-            ulong cy[3] = { 0, 0, 0 };
+            ulong cy[2] = { 0, 0, };
 
             for (i = 0; i < n; i++)
             {
                 umul_ppmm(hi, lo, a[i], c);
-                add_sssaaaaaa(cy[2], cy[1], cy[0], cy[2], cy[1], cy[0], 0, hi, lo);
-                z[i] = flint_mpn_divrem_3_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm);
+                add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
+                z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm);
             }
 
             FLINT_ASSERT(cy[0] < mod.n);
             FLINT_ASSERT(cy[1] == 0);
-            FLINT_ASSERT(cy[2] == 0);
             return cy[0];
         }
     }

From 45be05939460ee33b8657e31fec850671c75402f Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Fri, 20 Feb 2026 09:40:14 +0100
Subject: [PATCH 5/5] Inline division code in radix_mul_1

---
 src/radix/mul_1.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c
index 1d2f2d7b4b..e82f0aefdf 100644
--- a/src/radix/mul_1.c
+++ b/src/radix/mul_1.c
@@ -11,6 +11,36 @@
 
 #include "radix.h"
 
+/* Inline version of flint_mpn_divrem_2_1_preinv_unnorm for speed.  */
+FLINT_FORCE_INLINE mp_limb_t
+flint_mpn_divrem_2_1_preinv_unnorm1(mp_ptr qp, mp_srcptr up, mp_limb_t d, mp_limb_t dinv, unsigned int norm)
+{
+    mp_limb_t u0, u1, r;
+
+    FLINT_ASSERT(norm >= 1);
+
+    u1 = up[1];
+    u0 = up[0];
+    if (u1 < d)
+    {
+        d <<= norm;
+        qp[1] = 0;
+        r = (u1 << norm) | (u0 >> (FLINT_BITS - norm));
+    }
+    else
+    {
+        /* Hack: this branch is unreachable, but leaving it in results
+           in faster code with GCC on Zen 3. */
+        d <<= norm;
+        r = (u1 >> (FLINT_BITS - norm));
+        udiv_qrnnd_preinv(qp[1], r, r, (u1 << norm) | (u0 >> (FLINT_BITS - norm)), d, dinv);
+    }
+
+    udiv_qrnnd_preinv(qp[0], r, r, u0 << norm, d, dinv);
+    return r >> norm;
+}
+
+
 ulong
 radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix)
 {
@@ -67,7 +97,7 @@ radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix)
             {
                 umul_ppmm(hi, lo, a[i], c);
                 add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
-                z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm);
+                z[i] = flint_mpn_divrem_2_1_preinv_unnorm1(cy, cy, mod.n, mod.ninv, mod.norm);
             }
 
             FLINT_ASSERT(cy[0] < mod.n);