diff --git a/doc/source/radix.rst b/doc/source/radix.rst
index 0f32c03c07..161130937b 100644
--- a/doc/source/radix.rst
+++ b/doc/source/radix.rst
@@ -107,6 +107,15 @@ Except where otherwise noted, the following rules apply:
Sets *(res, xn)* to the sum or difference of *(x, xn)* and *(y, yn)*, returning
carry or borrow. Requires `xn \ge yn \ge 1`.
+.. function:: ulong radix_mul_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix)
+
+ Sets *(res, xn)* to *(x, xn)* multiplied by *c*, returning
+ carry out. Requires `1 \le d \le c`.
+
+.. function:: ulong radix_mul_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+
+ Sets *(res, xn)* to *(x, xn)* multiplied by 2, returning carry out.
+
.. function:: void radix_mul(nn_ptr res, nn_srcptr x, slong xn, nn_srcptr y, slong yn, const radix_t radix)
Sets *(res, xn + yn)* to the product of *(x, xn)* and *(y, yn)*.
@@ -163,6 +172,11 @@ Except where otherwise noted, the following rules apply:
Sets *(res, xn)* to the quotient of *(x, xn)* divided by *d*, returning the
remainder. Requires `1 \le d \le B - 1`.
+.. function:: ulong radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+
+ Sets *(res, xn)* to the quotient of *(x, xn)* divided by 2, returning the
+ remainder.
+
.. function:: void radix_divexact_1(nn_ptr res, nn_srcptr x, slong xn, ulong d, const radix_t radix)
Sets *(res, xn)* to the quotient of *(x, xn)* divided by *d* assuming that
@@ -176,6 +190,8 @@ Except where otherwise noted, the following rules apply:
`a \in [1/B, 1)` with `an` fraction limbs,
sets `(q, n+2)` to an approximation of `1/a \in (1, B]` with `n` fraction limbs
and two integral limbs (the highest limb may be zero).
+ The relative error is bounded by `4 B^{-n}`, i.e. the absolute error is bounded
+ by `4 B^{-n} / a`.
.. function:: void radix_divrem_via_mpn(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
void radix_divrem_newton(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
@@ -183,14 +199,17 @@ Except where otherwise noted, the following rules apply:
Sets `(q,an-bn+1)` to the quotient and `(r,bn)` to the remainder of
`(a,an)` divided by `(b,bn)`. Requires `an \ge bn \ge 1` and
- `b_{bn-1} \ne 0`.
+ `b_{bn-1} \ne 0`. The user can pass ``NULL`` for `r` to compute
+ only the quotient.
.. function:: void radix_divrem_preinv(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, nn_srcptr binv, slong binvn, const radix_t radix)
Similar to :func:`radix_divrem`, but accepts a precomputed inverse
of `b` given as `(b, binvn+2)` with `binvn` fraction limbs and two
integral limbs, as computed by :func:`radix_inv_approx`.
- Currently requires that `binvn \ge an-bn+1`.
+ Currently requires that `binvn \ge an-bn+1`. Passing an inverse with
+ several extra limbs can improve performance. The user can pass ``NULL``
+ for `r` to compute only the quotient.
.. function:: int radix_div(nn_ptr q, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
@@ -214,6 +233,13 @@ Except where otherwise noted, the following rules apply:
Returns -1, 0 or 1 according to whether *(x, n)* is less, equal
or greater than `\lfloor B^n / 2 \rfloor`. We require *n* to be positive.
+.. function:: void radix_rsqrt_1_approx_basecase(nn_ptr res, ulong a, slong n, const radix_t radix)
+ void radix_rsqrt_1_approx(nn_ptr res, ulong a, slong n, const radix_t radix)
+
+ Sets `(res,n)` to the fractional limbs of an approximation of `1 / \sqrt{a}`.
+ Assumes that `2 \le a < B`. The error is bounded by `2 B^{-n}`, i.e. by
+ 2 fixed-point ulps.
+
Radix conversion
--------------------------------------------------------------------------------
diff --git a/src/radix.h b/src/radix.h
index 2af5de3971..12a24d11c9 100644
--- a/src/radix.h
+++ b/src/radix.h
@@ -79,6 +79,8 @@ radix_sub_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix)
/* Multiplication */
+ulong radix_mul_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix);
+
void radix_mulmid_fft_small(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix);
void radix_mulmid_classical(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix);
void radix_mulmid_KS(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix);
@@ -111,10 +113,17 @@ radix_sqr(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
radix_mul(res, a, an, a, an, radix);
}
+RADIX_INLINE ulong
+radix_mul_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+{
+ return radix_add(res, a, an, a, an, radix);
+}
+
/* Division */
ulong radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix);
void radix_divexact_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix);
+ulong radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix);
void radix_inv_approx_basecase(nn_ptr q, nn_srcptr a, slong an, slong n, const radix_t radix);
void radix_inv_approx(nn_ptr q, nn_srcptr a, slong an, slong n, const radix_t radix);
@@ -155,6 +164,11 @@ radix_cmp_bn_half(nn_srcptr x, slong n, const radix_t radix)
return 0;
}
+/* Square roots */
+
+void radix_rsqrt_1_approx_basecase(nn_ptr res, ulong a, slong n, const radix_t radix);
+void radix_rsqrt_1_approx(nn_ptr res, ulong a, slong n, const radix_t radix);
+
/* Radix conversion */
typedef struct
diff --git a/src/radix/div.c b/src/radix/div.c
index 1b59768619..7ff5a33bb1 100644
--- a/src/radix/div.c
+++ b/src/radix/div.c
@@ -12,6 +12,29 @@
#include "longlong.h"
#include "radix.h"
+ulong
+radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
+{
+ slong i;
+ ulong q, r, hi, lo, B = LIMB_RADIX(radix);
+
+ q = a[an - 1] >> 1;
+ r = a[an - 1] & 1;
+ res[an - 1] = q;
+
+ for (i = an - 2; i >= 0; i--)
+ {
+ umul_ppmm(hi, lo, r, B);
+ add_ssaaaa(hi, lo, hi, lo, 0, a[i]);
+ q = (hi << (FLINT_BITS - 1)) | (lo >> 1);
+ r = lo & 1;
+ res[i] = q;
+ }
+
+ return r;
+}
+
+
/* todo: optimise */
ulong
radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix)
@@ -154,6 +177,33 @@ radix_inv_approx_basecase(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_
/* Must be at least 4 */
#define RADIX_INV_NEWTON_CUTOFF 8
+/*
+Claim: the absolute error is bounded by 4*B^(-n)/A. This is not tight.
+
+Proof sketch: let m = ceil(n/2) + 1 + [B <= 3].
+
+The initial approximation is assumed to satisfy
+
+ Y = 1/A + e1 where e1 <= C*B^(-m)/A where we take C = 4.
+
+We compute Y' = Y + (1 - Y * (A + e2) + e3) * Y + e4 where
+
+ e2 = B^(-n-1) is the error from truncating A,
+
+ e3 = B^(-n) + {(m+2)*B^(-n-1)}
+ e4 = B^(-n) + {(m+2)*B^(-n-1)}
+
+are the errors from fixed-point multiplications where the term in curly
+brackets appears only when we use mulhigh, when B > 2 * (m + 2) is satisfied.
+
+Expanding the expression for Y' gives
+
+ |Y' - 1/A| <= A*e1^2 + e1^2*e2 + e1*e3 + e4 - 2*e1*e2/A + e3/A + e2/A^2
+
+Verifying that the RHS satisfies the claimed bound is now straightforward
+(but exhausting) numerics.
+*/
+
void
radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
{
@@ -173,7 +223,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
Compute T ~= 1 / A as a fixed-point number with m fraction limbs,
2 integral limbs and store in the high part of Q.
*/
- m = n / 2 + 1 + (LIMB_RADIX(radix) == 2);
+ m = (n + 1) / 2 + 1 + (LIMB_RADIX(radix) <= 3);
radix_inv_approx(Q + n - m, A, An, m, radix);
TMP_START;
@@ -232,7 +282,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
// product.
slong low_trunc_with_mulhigh = m - 1;
- if (LIMB_RADIX(radix) > m + 2 && A_low_zeroes <= low_trunc_with_mulhigh)
+ if (LIMB_RADIX(radix) > 2 * (m + 2) && A_low_zeroes <= low_trunc_with_mulhigh)
{
U = TMP_ALLOC((2 + (control_limb + 1)) * sizeof(ulong));
radix_mulmid(U, Ahigh, Ahighn, T, Tn,
@@ -275,7 +325,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
if (Uhighn != 0)
{
/* Compute V = T * U with n fraction limbs. */
- if (LIMB_RADIX(radix) > m + 2)
+ if (LIMB_RADIX(radix) > 2 * (m + 2))
{
// V = T * |1 - A' * T| with n+2 fraction limbs
V = TMP_ALLOC((Tn + Uhighn - (m - 2)) * sizeof(ulong));
@@ -304,6 +354,15 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
}
}
+/* Instead of computing Q with ~1 ulp error and doing a full multiplication
+ to check the remainder, compute a few fraction limbs. If the first
+ fraction limb is not too close to the limb boundary, we have the correct
+ quotient and the remainder can be determined by a low multiplication
+ (or omitted if we only want the quotient). */
+#define USE_PREINV2 1
+/* Must be >= 2 */
+#define PREINV2_EXTRA_LIMBS 2
+
void
radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slong Bn, nn_srcptr Binv, slong Binvn, const radix_t radix)
{
@@ -319,17 +378,61 @@ radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon
if (Binvn < n)
flint_throw(FLINT_ERROR, "radix_divrem_preinv: inverse has too few limbs");
- if (LIMB_RADIX(radix) > n + 2)
+ slong n2 = n + PREINV2_EXTRA_LIMBS;
+
+ /*
+ Consider A, B as fixed-point numbers with A in [0,1), 1/B in [1/beta,1),
+ i.e. the integer quotient is floor((A/B)*beta^(an-bn)).
+
+ B' = High n2 limbs of Binv = 1/B * (1 + e1) where e1 <= 4*beta^(-n2)
+ e2 <= beta^(-n2) [truncation error for A]
+ e3 <= n2*beta^(-n2+1) [truncation error for product]
+
+ (A + e2) * B' + e3 - A/B = [A/B*e1 + e3 + e1*e2/B + e2/B]
+ <= [(e1 + e1*e2 + e2)*beta + e3]
+ < [(n2 + 6) beta^(-n2+1)]
+
+ PREINV2_EXTRA_LIMBS >= 2 ensures that q[-1] has <1 ulp error.
+ */
+#if USE_PREINV2
+ if (Binvn >= n2 && An >= n2 && LIMB_RADIX(radix) > n2 + 6)
{
- U = TMP_ALLOC((n + 3) * sizeof(ulong));
- radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix);
- q = U + 2;
+ U = TMP_ALLOC((n2 + 2) * sizeof(ulong));
+ /* n2 fraction limbs x (n2 fraction limbs + 2 integral limbs) */
+ radix_mulmid(U, A + An - n2, n2, Binv + Binvn - n2, n2 + 2, n2, 2 * n2 + 2, radix);
+
+ FLINT_ASSERT(U[n2 + 1] == 0);
+ q = U + n2 + 2 - (n + 1);
+
+ if (q[-1] > 1 && q[-1] < LIMB_RADIX(radix) - 1)
+ {
+ if (R == NULL)
+ r = NULL;
+ else
+ {
+ T = TMP_ALLOC(Bn * sizeof(ulong));
+ r = T;
+ radix_mulmid(r, q, FLINT_MIN(Bn, n + 1), B, Bn, 0, Bn, radix);
+ radix_sub(r, A, Bn, r, Bn, radix);
+ }
+ goto done;
+ }
}
else
+#endif
{
- U = TMP_ALLOC((2 * n + 2) * sizeof(ulong));
- radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix);
- q = U + An - Bn + 2;
+ if (LIMB_RADIX(radix) > n + 2)
+ {
+ U = TMP_ALLOC((n + 3) * sizeof(ulong));
+ radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix);
+ q = U + 2;
+ }
+ else
+ {
+ U = TMP_ALLOC((2 * n + 2) * sizeof(ulong));
+ radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix);
+ q = U + An - Bn + 2;
+ }
}
T = TMP_ALLOC((Bn + n) * sizeof(ulong));
@@ -354,8 +457,10 @@ radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon
radix_sub(r, r, Bn + 1, B, Bn, radix);
}
+done:
flint_mpn_copyi(Q, q, An - Bn + 1);
- flint_mpn_copyi(R, r, Bn);
+ if (R != NULL)
+ flint_mpn_copyi(R, r, Bn);
TMP_END;
}
@@ -370,15 +475,21 @@ radix_divrem_newton(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon
if (flint_mpn_zero_p(A + Bn, An - Bn) && mpn_cmp(A, B, Bn) < 0)
{
- flint_mpn_copyi(R, A, Bn);
+ if (R != NULL)
+ flint_mpn_copyi(R, A, Bn);
flint_mpn_zero(Q, An - Bn + 1);
return;
}
n = An - Bn + 1;
- T = TMP_ALLOC((n + 2) * sizeof(ulong));
+ T = TMP_ALLOC((n + PREINV2_EXTRA_LIMBS + 2) * sizeof(ulong));
+#if USE_PREINV2
+ radix_inv_approx(T, B, Bn, n + PREINV2_EXTRA_LIMBS, radix);
+ radix_divrem_preinv(Q, R, A, An, B, Bn, T, n + PREINV2_EXTRA_LIMBS, radix);
+#else
radix_inv_approx(T, B, Bn, n, radix);
radix_divrem_preinv(Q, R, A, An, B, Bn, T, n, radix);
+#endif
TMP_END;
return;
}
@@ -392,11 +503,13 @@ radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, c
if (bn == 1)
{
- r[0] = radix_divrem_1(q, a, an, b[0], radix);
+ ulong r0 = radix_divrem_1(q, a, an, b[0], radix);
+ if (r != NULL)
+ r[0] = r0;
return;
}
- if (bn >= 48)
+ if ((bn >= 20 && (an - bn + 1) <= 70) || bn >= 80)
{
radix_divrem_newton(q, r, a, an, b, bn, radix);
return;
@@ -438,7 +551,8 @@ radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, c
cy = q[i * bn];
}
- flint_mpn_copyi(r, R, bn);
+ if (r != NULL)
+ flint_mpn_copyi(r, R, bn);
TMP_END;
return;
}
diff --git a/src/radix/integer.c b/src/radix/integer.c
index d6f5969985..d3e7197c12 100644
--- a/src/radix/integer.c
+++ b/src/radix/integer.c
@@ -862,6 +862,48 @@ void radix_integer_tdiv_qr(radix_integer_t q, radix_integer_t r,
r->size = (asize < 0) ? -rn : rn;
}
+void radix_integer_tdiv_q(radix_integer_t q,
+ const radix_integer_t a, const radix_integer_t b, const radix_t radix)
+{
+ slong asize, bsize, an, bn, qn;
+ nn_srcptr ad, bd;
+ nn_ptr qd;
+
+ asize = a->size;
+ bsize = b->size;
+
+ an = FLINT_ABS(asize);
+ bn = FLINT_ABS(bsize);
+
+ if (bn == 0)
+ flint_throw(FLINT_DIVZERO, "radix_integer_tdiv_qr: divide by zero");
+
+ if (an == 0)
+ {
+ radix_integer_zero(q, radix);
+ return;
+ }
+
+ if (an < bn)
+ {
+ radix_integer_zero(q, radix);
+ return;
+ }
+
+ qn = an - bn + 1;
+
+ ad = a->d;
+ bd = b->d;
+
+ qd = radix_integer_fit_limbs(q, qn, radix);
+
+ radix_divrem(qd, NULL, ad, an, bd, bn, radix);
+
+ MPN_NORM(qd, qn);
+
+ q->size = ((asize < 0) == (bsize < 0)) ? qn : -qn;
+}
+
void radix_integer_fdiv_qr(radix_integer_t q, radix_integer_t r,
const radix_integer_t a, const radix_integer_t b, const radix_t radix)
{
@@ -1003,15 +1045,6 @@ void radix_integer_cdiv_qr(radix_integer_t q, radix_integer_t r,
}
/* Todo: optimize */
-void
-radix_integer_tdiv_q(radix_integer_t q,
- const radix_integer_t a, const radix_integer_t b, const radix_t radix)
-{
- radix_integer_t r;
- radix_integer_init(r, radix);
- radix_integer_tdiv_qr(q, r, a, b, radix);
- radix_integer_clear(r, radix);
-}
void
radix_integer_fdiv_q(radix_integer_t q,
diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c
new file mode 100644
index 0000000000..e82f0aefdf
--- /dev/null
+++ b/src/radix/mul_1.c
@@ -0,0 +1,109 @@
+/*
+ Copyright (C) 2026 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "radix.h"
+
+/* Inline version of flint_mpn_divrem_2_1_preinv_unnorm for speed. */
+FLINT_FORCE_INLINE mp_limb_t
+flint_mpn_divrem_2_1_preinv_unnorm1(mp_ptr qp, mp_srcptr up, mp_limb_t d, mp_limb_t dinv, unsigned int norm)
+{
+ mp_limb_t u0, u1, r;
+
+ FLINT_ASSERT(norm >= 1);
+
+ u1 = up[1];
+ u0 = up[0];
+ if (u1 < d)
+ {
+ d <<= norm;
+ qp[1] = 0;
+ r = (u1 << norm) | (u0 >> (FLINT_BITS - norm));
+ }
+ else
+ {
+ /* Hack: this branch is unreachable, but leaving it in results
+ in faster code with GCC on Zen 3. */
+ d <<= norm;
+ r = (u1 >> (FLINT_BITS - norm));
+ udiv_qrnnd_preinv(qp[1], r, r, (u1 << norm) | (u0 >> (FLINT_BITS - norm)), d, dinv);
+ }
+
+ udiv_qrnnd_preinv(qp[0], r, r, u0 << norm, d, dinv);
+ return r >> norm;
+}
+
+
+ulong
+radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix)
+{
+ slong i;
+ ulong hi, lo;
+ slong sbits;
+ nmod_t mod = radix->B;
+
+ FLINT_ASSERT(n >= 1);
+ FLINT_ASSERT(c < mod.n);
+
+ sbits = 2 * NMOD_BITS(mod);
+
+ if (sbits <= FLINT_BITS)
+ {
+ ulong cy = 0;
+ FLINT_ASSERT(mod.norm != 0);
+
+ for (i = 0; i < n; i++)
+ {
+ cy += a[i] * c;
+ z[i] = n_divrem_preinv_unnorm(&cy, cy, mod.n, mod.ninv, mod.norm);
+ }
+
+ FLINT_ASSERT(cy < mod.n);
+ return cy;
+ }
+ else
+ {
+ if (mod.norm == 0)
+ {
+ ulong cy0 = 0;
+ ulong cy1 = 0;
+ ulong r;
+
+ for (i = 0; i < n; i++)
+ {
+ umul_ppmm(hi, lo, a[i], c);
+ add_ssaaaa(cy1, cy0, cy1, cy0, hi, lo);
+ r = n_divrem_norm(&cy1, cy1, mod.n);
+ udiv_qrnnd_preinv(cy0, r, r, cy0, mod.n, mod.ninv);
+ z[i] = r;
+ }
+
+ FLINT_ASSERT(cy0 < mod.n);
+ FLINT_ASSERT(cy1 == 0);
+ return cy0;
+ }
+ else
+ {
+ ulong cy[2] = { 0, 0, };
+
+ for (i = 0; i < n; i++)
+ {
+ umul_ppmm(hi, lo, a[i], c);
+ add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo);
+ z[i] = flint_mpn_divrem_2_1_preinv_unnorm1(cy, cy, mod.n, mod.ninv, mod.norm);
+ }
+
+ FLINT_ASSERT(cy[0] < mod.n);
+ FLINT_ASSERT(cy[1] == 0);
+ return cy[0];
+ }
+ }
+}
+
diff --git a/src/radix/mulmid_classical.c b/src/radix/mulmid_classical.c
index 6bcf13006e..b415fa2bcc 100644
--- a/src/radix/mulmid_classical.c
+++ b/src/radix/mulmid_classical.c
@@ -242,7 +242,6 @@ radix_sqrmid_classical(nn_ptr z, nn_srcptr a, slong an, slong zlo, slong zhi, co
}
}
-
void
radix_mulmid_classical(nn_ptr z, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong zlo, slong zhi, const radix_t radix)
{
diff --git a/src/radix/profile/p-constants.c b/src/radix/profile/p-constants.c
new file mode 100644
index 0000000000..55486d8c6c
--- /dev/null
+++ b/src/radix/profile/p-constants.c
@@ -0,0 +1,305 @@
+/*
+ Copyright (C) 2026 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include
+#include "arb.h"
+#include "radix.h"
+#include "double_extras.h"
+#include "mag.h"
+#include "profiler.h"
+#include "hypgeom.h"
+#include "fmpz_poly.h"
+
+/* Don't call arb_const_e because it caches the result. */
+static void
+arb_const_e_eval(arb_t s, slong prec)
+{
+ hypgeom_t series;
+ arb_t t;
+
+ arb_init(t);
+ hypgeom_init(series);
+
+ fmpz_poly_set_str(series->A, "1 1");
+ fmpz_poly_set_str(series->B, "1 1");
+ fmpz_poly_set_str(series->P, "1 1");
+ fmpz_poly_set_str(series->Q, "2 0 1");
+
+ prec += FLINT_CLOG2(prec);
+ arb_hypgeom_infsum(s, t, series, prec, prec);
+ arb_div(s, s, t, prec);
+
+ hypgeom_clear(series);
+ arb_clear(t);
+}
+
+char * radix_integer_get_str_decimal(char * s, const radix_integer_t x, const radix_t radix)
+{
+ return radix_get_str_decimal(s, x->d, FLINT_ABS(x->size), x->size < 0, radix);
+}
+
+slong radix_digits_to_limbs(slong n, const radix_t radix)
+{
+ return (n + radix->exp - 1) / radix->exp;
+}
+
+void
+print_condensed(const char * s, slong ndigits, slong n)
+{
+ char * tmp = flint_malloc(n + 1);
+
+ strncpy(tmp, s, 1);
+ tmp[1] = '\0';
+ flint_printf("%s.", tmp);
+
+ strncpy(tmp, s + 1, n);
+ tmp[n] = '\0';
+
+ flint_printf("%s{...%wd digits...}%s\n", tmp, ndigits - 2 * n - 1, s + ndigits - n);
+
+ flint_free(tmp);
+}
+
+/* Calculate sqrt(a) as a fixed-point number with n fraction limbs. */
+void
+radix_integer_fixed_sqrt_1(radix_integer_t res, ulong a, slong n, const radix_t radix)
+{
+ nn_ptr rd;
+ rd = radix_integer_fit_limbs(res, n + 1, radix);
+ radix_rsqrt_1_approx(rd, a, n, radix);
+ if (a == 2)
+ rd[n] = radix_mul_two(rd, rd, n, radix);
+ else
+ rd[n] = radix_mul_1(rd, rd, n, a, radix);
+ res->size = n + 1;
+}
+
+#define BSPLIT_E_MPN_BASECASE 2
+
+static void
+bsplit_e(radix_integer_t P, radix_integer_t Q, slong a, slong b, const radix_t radix)
+{
+ if (b - a == 1)
+ {
+ radix_integer_one(P, radix);
+ radix_integer_set_ui(Q, b, radix);
+ }
+ else if (b - a == 2 && b < (UWORD(1) << (FLINT_BITS / 2)))
+ {
+ radix_integer_set_ui(P, a + 3, radix);
+ radix_integer_set_ui(Q, (a + 1) * (a + 2), radix);
+ }
+ else if (b - a <= BSPLIT_E_MPN_BASECASE && b < (UWORD(1) << (FLINT_BITS / 2)))
+ {
+ ulong p[BSPLIT_E_MPN_BASECASE];
+ ulong q[BSPLIT_E_MPN_BASECASE];
+ ulong p2, q2;
+ slong pn, qn, k;
+
+ p[0] = a + 3;
+ q[0] = (a + 1) * (a + 2);
+ pn = qn = 1;
+
+ for (k = a + 2; k < b; k++)
+ {
+ if (k + 1 < b) { p2 = k + 3; q2 = (k + 1) * (k + 2); k++; }
+ else { p2 = 1; q2 = k + 1; }
+
+ p[pn] = mpn_mul_1(p, p, pn, q2);
+ p[pn] += mpn_add_1(p, p, pn, p2);
+ pn += (p[pn] != 0);
+ q[qn] = mpn_mul_1(q, q, qn, q2);
+ qn += (q[qn] != 0);
+ }
+
+ P->size = radix_set_mpn(radix_integer_fit_limbs(P,
+ radix_set_mpn_need_alloc(pn, radix), radix), p, pn, radix);
+ Q->size = radix_set_mpn(radix_integer_fit_limbs(Q,
+ radix_set_mpn_need_alloc(qn, radix), radix), q, qn, radix);
+ }
+ else
+ {
+ slong m = a + (b - a) / 2;
+
+ radix_integer_t P1, Q1, P2, Q2;
+
+ radix_integer_init(P1, radix);
+ radix_integer_init(Q1, radix);
+ radix_integer_init(P2, radix);
+ radix_integer_init(Q2, radix);
+
+ bsplit_e(P1, Q1, a, m, radix);
+ bsplit_e(P2, Q2, m, b, radix);
+
+ radix_integer_mul(P, P1, Q2, radix);
+ radix_integer_add(P, P, P2, radix);
+ radix_integer_mul(Q, Q1, Q2, radix);
+
+ radix_integer_clear(P1, radix);
+ radix_integer_clear(Q1, radix);
+ radix_integer_clear(P2, radix);
+ radix_integer_clear(Q2, radix);
+ }
+}
+
+void
+radix_integer_fixed_e(radix_integer_t res, slong n, const radix_t radix)
+{
+ slong N;
+ radix_integer_t P, Q;
+
+ /* Select truncation N */
+ {
+ mag_t one, eps, err;
+
+ double logB = log(LIMB_RADIX(radix));
+ N = n * logB / d_lambertw(n * logB * exp(-1.0)) + 1;
+
+ mag_init(one);
+ mag_init(eps);
+ mag_init(err);
+
+ mag_one(one);
+ mag_set_ui(eps, LIMB_RADIX(radix));
+ mag_inv_lower(eps, eps);
+ mag_pow_ui_lower(eps, eps, n);
+
+ while (1)
+ {
+ mag_exp_tail(err, one, N + 1);
+ if (mag_cmp(err, eps) <= 0)
+ break;
+ else
+ N++;
+ }
+
+ mag_clear(one);
+ mag_clear(err);
+ mag_clear(eps);
+ }
+
+ radix_integer_init(P, radix);
+ radix_integer_init(Q, radix);
+
+ bsplit_e(P, Q, 1, N, radix);
+ radix_integer_lshift_limbs(P, P, n, radix);
+ radix_integer_tdiv_q(res, P, Q, radix);
+ radix_integer_set_limb(res, res, n, 2, radix);
+
+ radix_integer_clear(P, radix);
+ radix_integer_clear(Q, radix);
+}
+
+int main()
+{
+ slong ndigits;
+ radix_t radix;
+ char * s;
+
+ radix_init(radix, 10, 0);
+
+ flint_set_num_threads(1);
+
+ for (ndigits = 100000; ndigits <= 1000000000; ndigits *= 10)
+ {
+ radix_integer_t x;
+ arb_t y;
+
+ slong prec_bits = 3.32192809488736 * ndigits + 1;
+ slong prec_radix = radix_digits_to_limbs(ndigits, radix);
+
+ flint_printf("\n10^%wd digits (%wd bits; %wd decimal limbs = %wd digits)\n",
+ n_clog(ndigits, 10),
+ prec_bits, prec_radix, prec_radix * radix->exp);
+
+ arb_init(y);
+ flint_printf("Arb compute sqrt(2): ");
+ TIMEIT_START;
+ arb_sqrt_ui(y, 2, prec_bits);
+ TIMEIT_STOP;
+
+ flint_printf("Arb to string: ");
+ s = NULL;
+ TIMEIT_START;
+ flint_free(s);
+ s = arb_get_str(y, ndigits, ARB_STR_CONDENSE * 20);
+ TIMEIT_STOP;
+
+ flint_printf("%s\n", s);
+ arb_clear(y);
+
+ radix_integer_init(x, radix);
+ flint_printf("Radix compute sqrt(2): ");
+ TIMEIT_START;
+ radix_integer_fixed_sqrt_1(x, 2, prec_radix, radix);
+ TIMEIT_STOP;
+
+ flint_printf("Radix to string: ");
+ s = NULL;
+ TIMEIT_START;
+ flint_free(s);
+ s = radix_integer_get_str_decimal(NULL, x, radix);
+ TIMEIT_STOP;
+
+ print_condensed(s, ndigits, 20);
+ radix_integer_clear(x, radix);
+ }
+
+ for (ndigits = 100000; ndigits <= 1000000000; ndigits *= 10)
+ {
+ radix_integer_t x;
+ arb_t y;
+
+ slong prec_bits = 3.32192809488736 * ndigits + 1;
+ slong prec_radix = radix_digits_to_limbs(ndigits, radix);
+
+ flint_printf("\n10^%wd digits (%wd bits; %wd decimal limbs = %wd digits)\n",
+ n_clog(ndigits, 10),
+ prec_bits, prec_radix, prec_radix * radix->exp);
+
+ arb_init(y);
+ flint_printf("Arb compute e: ");
+ TIMEIT_START;
+ arb_const_e_eval(y, prec_bits);
+ TIMEIT_STOP;
+
+ flint_printf("Arb to string: ");
+ s = NULL;
+ TIMEIT_START;
+ flint_free(s);
+ s = arb_get_str(y, ndigits, ARB_STR_CONDENSE * 20);
+ TIMEIT_STOP;
+
+ flint_printf("%s\n", s);
+ arb_clear(y);
+
+ radix_integer_init(x, radix);
+ flint_printf("Radix compute e: ");
+ TIMEIT_START;
+ radix_integer_fixed_e(x, prec_radix, radix);
+ TIMEIT_STOP;
+
+ flint_printf("Radix to string: ");
+ s = NULL;
+ TIMEIT_START;
+ flint_free(s);
+ s = radix_integer_get_str_decimal(NULL, x, radix);
+ TIMEIT_STOP;
+
+ print_condensed(s, ndigits, 20);
+ radix_integer_clear(x, radix);
+ }
+
+ radix_clear(radix);
+ flint_cleanup_master();
+ return 0;
+}
+
diff --git a/src/radix/profile/p-tdiv_q.c b/src/radix/profile/p-tdiv_q.c
new file mode 100644
index 0000000000..af99553fb0
--- /dev/null
+++ b/src/radix/profile/p-tdiv_q.c
@@ -0,0 +1,107 @@
+/*
+ Copyright (C) 2026 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include
+#include "profiler.h"
+#include "radix.h"
+#include "fmpz.h"
+
+/* there is no flint_mpn_tdiv_q yet, so mock one up */
+void
+flint_mpn_tdiv_q(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn)
+{
+ if (bn < 1000)
+ {
+ mpn_tdiv_qr(q, r, 0, a, an, b, bn);
+ }
+ else
+ {
+ fmpz_t fa, fb, fq;
+
+ fmpz_init(fa);
+ fmpz_init(fb);
+ fmpz_init(fq);
+
+ fmpz_set_ui_array(fa, a, an);
+ fmpz_set_ui_array(fb, b, bn);
+
+ fmpz_tdiv_q(fq, fa, fb);
+
+ //fmpz_get_ui_array(q, an - bn + 1, fq);
+ //fmpz_get_ui_array(r, bn, fr);
+
+ fmpz_clear(fa);
+ fmpz_clear(fb);
+ fmpz_clear(fq);
+ }
+}
+
+int main()
+{
+ radix_t radix;
+ slong digits, nd, n1, n2;
+ nn_ptr a, b, c, d;
+ double tmpn, tradix, FLINT_SET_BUT_UNUSED(tt);
+
+ flint_rand_t state;
+ flint_rand_init(state);
+
+ flint_printf(" decimal mpn decimal time time relative\n");
+ flint_printf(" digits limbs limbs flint_mpn_tdiv_q radix_tdiv_q time\n\n");
+
+ for (nd = 1; nd <= 100000000; nd = FLINT_MAX(nd + 1, nd * 1.5))
+ {
+ radix_init(radix, 10, 0);
+
+ digits = nd * 19;
+
+ /* Number of full-word limbs */
+ n1 = (slong) (digits * (log(10) / (FLINT_BITS * log(2))) + 1.0);
+ /* Number of radix 10^e limbs */
+ n2 = (digits + radix->exp - 1) / radix->exp;
+
+ a = flint_malloc(2 * n2 * sizeof(ulong));
+ b = flint_malloc(n2 * sizeof(ulong));
+ c = flint_malloc((n2 + 1) * sizeof(ulong));
+ d = flint_malloc(n2 * sizeof(ulong));
+
+ flint_mpn_urandomb(a, state, 2 * n1 * FLINT_BITS);
+ flint_mpn_urandomb(b, state, n1 * FLINT_BITS);
+
+ flint_mpn_tdiv_q(c, d, a, 2 * n1, b, n1);
+ TIMEIT_START;
+ flint_mpn_tdiv_q(c, d, a, 2 * n1, b, n1);
+ TIMEIT_STOP_VALUES(tt, tmpn);
+
+ radix_rand_limbs(a, state, 2 * n2, radix);
+ radix_rand_limbs(b, state, n2, radix);
+
+ radix_divrem(c, NULL, a, 2 * n2, b, n2, radix);
+ TIMEIT_START;
+ radix_divrem(c, NULL, a, 2 * n2, b, n2, radix);
+ TIMEIT_STOP_VALUES(tt, tradix);
+
+ flint_printf("%10wd %8wd %8wd %8g %8g %.3fx\n",
+ digits, n1, n2, tmpn, tradix, tradix / tmpn);
+
+ flint_free(a);
+ flint_free(b);
+ flint_free(c);
+ flint_free(d);
+
+ radix_clear(radix);
+ }
+
+ flint_rand_clear(state);
+ flint_cleanup_master();
+ return 0;
+}
+
diff --git a/src/radix/sqrt.c b/src/radix/sqrt.c
new file mode 100644
index 0000000000..175099980f
--- /dev/null
+++ b/src/radix/sqrt.c
@@ -0,0 +1,129 @@
+/*
+ Copyright (C) 2026 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "radix.h"
+
+void radix_rsqrt_1_approx_basecase(nn_ptr Y, ulong a, slong n, const radix_t radix)
+{
+ nn_ptr U, V;
+ slong nU, nV;
+ TMP_INIT;
+
+ TMP_START;
+ U = TMP_ALLOC((4 * n + 2) * sizeof(ulong));
+ V = U + 2 * n + 1;
+
+ flint_mpn_zero(U, 2 * n);
+ U[2 * n] = a;
+
+ nV = radix_get_mpn(V, U, 2 * n + 1, radix);
+ mpn_sqrtrem(U, NULL, V, nV);
+ nU = (nV + 1) / 2;
+ MPN_NORM(U, nU);
+ mpn_divrem_1(U, 0, U, nU, a);
+ MPN_NORM(U, nU);
+
+ nV = radix_set_mpn(V, U, nU, radix);
+
+ flint_mpn_copyi(Y, V, nV);
+ flint_mpn_zero(Y + nV, n - nV);
+
+ TMP_END;
+}
+
+// todo: allocs
+// todo: fast mul_1
+// todo: fast divrem_two
+
+/*
+The error is smaller than 2 ulp, and for large radix, 1+eps ulp. Proof sketch:
+suppose on input y = 1/sqrt(a) + eps where we assume that |eps| <= C*B^(-m)
+for some C < 2. The mathematical error after the Newton iteration is
+
+ 1/sqrt(a) - 3*sqrt(a)/2 * eps^2 - a/2 * eps^3.
+
+The calculation of 1 - a*y^2 is done exactly and generates a fixed-point number
+with 2m-limb precision. Subsequently dividing by 2 generates up to B^(-2m)
+error. Rounding the result of the final multiplication by y generates up to
+B^(-n) error, plus B^(-2m) error if mulhigh is used.
+
+Note that m >= n/2 + 1. Setting e.g. C = 1.7 one can check explicitly that
+the error is bounded by C*B^(-n) in the worst case B = 3.
+For larger B one can choose C closer to 1.
+*/
+
+void radix_rsqrt_1_approx(nn_ptr Y, ulong a, slong n, const radix_t radix)
+{
+ if (n <= 4)
+ {
+ radix_rsqrt_1_approx_basecase(Y, a, n, radix);
+ }
+ else
+ {
+ slong m, Un;
+ ulong cy;
+ nn_ptr T, U;
+ slong Talloc, Ualloc;
+ nn_ptr Yhi, Thi;
+ TMP_INIT;
+
+ m = (n + 1) / 2 + 1;
+
+ Yhi = Y + n - m;
+ radix_rsqrt_1_approx(Yhi, a, m, radix);
+ flint_mpn_zero(Y, n - m);
+
+ /* TODO: reduce temporary space */
+ if (LIMB_RADIX(radix) > m)
+ Talloc = m + 3; /* In case of high product */
+ else
+ Talloc = 2 * m + 2; /* In case of full product */
+ Ualloc = m + 2;
+
+ TMP_START;
+ T = TMP_ALLOC((Talloc + Ualloc) * sizeof(ulong));
+ U = T + Talloc;
+
+ /* T = Yhi^2 with 2m fraction limbs */
+ radix_mulmid(T, Yhi, m, Yhi, m, 0, m + 2, radix);
+ radix_mulmid(U, T, m + 2, &a, 1, 0, m + 2, radix);
+ cy = (U[m + 1] == 0);
+ /* a*y^2-1 or 1-a*y^2 */
+ if (!cy)
+ radix_neg(U, U, m + 2, radix);
+
+ radix_divrem_two(U, U, m + 2, radix);
+ Un = m + 2;
+ MPN_NORM(U, Un);
+ if (Un + n - 2 * m <= 0)
+ goto cleanup;
+
+ if (LIMB_RADIX(radix) > m)
+ {
+ radix_mulmid(T, Yhi, m, U, Un, m - 2, Un + m, radix);
+ Thi = T + 2;
+ }
+ else
+ {
+ radix_mulmid(T, Yhi, m, U, Un, 0, Un + m, radix);
+ Thi = T + m;
+ }
+
+ if (cy)
+ radix_sub(Y, Y, n, Thi + 2 * m - n, Un + n - 2 * m, radix);
+ else
+ radix_add(Y, Y, n, Thi + 2 * m - n, Un + n - 2 * m, radix);
+
+cleanup:
+ TMP_END;
+ }
+}
+
diff --git a/src/radix/test/main.c b/src/radix/test/main.c
index 4d33bd358c..d5bb3ed9c6 100644
--- a/src/radix/test/main.c
+++ b/src/radix/test/main.c
@@ -18,10 +18,12 @@
#include "t-integer.c"
#include "t-inv_approx.c"
#include "t-invmod_bn.c"
+#include "t-mul_1.c"
#include "t-mulmid_classical.c"
#include "t-mulmid_fft_small.c"
#include "t-mulmid_KS.c"
#include "t-neg.c"
+#include "t-rsqrt_1_approx.c"
#include "t-set_mpn.c"
#include "t-sub.c"
@@ -36,10 +38,12 @@ test_struct tests[] =
TEST_FUNCTION(radix_integer),
TEST_FUNCTION(radix_inv_approx),
TEST_FUNCTION(radix_invmod_bn),
+ TEST_FUNCTION(radix_mul_1),
TEST_FUNCTION(radix_mulmid_classical),
TEST_FUNCTION(radix_mulmid_fft_small),
TEST_FUNCTION(radix_mulmid_KS),
TEST_FUNCTION(radix_neg),
+ TEST_FUNCTION(radix_rsqrt_1_approx),
TEST_FUNCTION(radix_set_mpn),
TEST_FUNCTION(radix_sub),
};
diff --git a/src/radix/test/t-divrem.c b/src/radix/test/t-divrem.c
index 2532425750..ce72875069 100644
--- a/src/radix/test/t-divrem.c
+++ b/src/radix/test/t-divrem.c
@@ -19,7 +19,7 @@ TEST_FUNCTION_START(radix_divrem, state)
for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++)
{
radix_t radix;
- nn_ptr a, b, q, r, bq, bqr;
+ nn_ptr a, b, q, q2, r, bq, bqr;
slong an, bn, qn, rn;
int algorithm, aliasing;
@@ -35,6 +35,7 @@ TEST_FUNCTION_START(radix_divrem, state)
a = flint_malloc(an * sizeof(ulong));
b = flint_malloc(bn * sizeof(ulong));
q = flint_malloc(qn * sizeof(ulong));
+ q2 = flint_malloc(qn * sizeof(ulong));
r = flint_malloc(rn * sizeof(ulong));
bq = flint_malloc(an * sizeof(ulong));
bqr = flint_malloc(an * sizeof(ulong));
@@ -123,11 +124,37 @@ TEST_FUNCTION_START(radix_divrem, state)
flint_abort();
}
+ aliasing = n_randint(state, 2);
+
+ if (aliasing == 0)
+ {
+ divrem_func(q2, NULL, a, an, b, bn, radix);
+ }
+ else
+ {
+ q2 = flint_realloc(q2, an * sizeof(ulong));
+ flint_mpn_copyi(q2, a, an);
+ divrem_func(q2, NULL, q2, an, b, bn, radix);
+ }
+
+ if (mpn_cmp(q, q2, qn) != 0)
+ {
+ flint_printf("FAIL: radix_divrem (r == NULL)\n");
+ flint_printf("algorithm = %d\n", algorithm);
+ flint_printf("aliasing = %d\n", aliasing);
+ flint_printf("a = %{ulong*}\n", a, an);
+ flint_printf("b = %{ulong*}\n", b, bn);
+ flint_printf("q = %{ulong*}\n", q, qn);
+ flint_printf("q2 = %{ulong*}\n", q2, qn);
+ flint_abort();
+ }
+
radix_clear(radix);
flint_free(a);
flint_free(b);
flint_free(q);
+ flint_free(q2);
flint_free(r);
flint_free(bq);
flint_free(bqr);
diff --git a/src/radix/test/t-divrem_1.c b/src/radix/test/t-divrem_1.c
index ad095494de..d134e3ab3d 100644
--- a/src/radix/test/t-divrem_1.c
+++ b/src/radix/test/t-divrem_1.c
@@ -108,6 +108,19 @@ TEST_FUNCTION_START(radix_divrem_1, state)
flint_abort();
}
+ r1 = radix_divrem_two(c, a, an, radix);
+ r2 = radix_divrem_1_naive(d, a, an, 2, radix);
+
+ if (r1 != r2 || mpn_cmp(c, d, an) != 0)
+ {
+ flint_printf("FAIL: divrem_two\n");
+ flint_printf("%{ulong*}\n", a, an);
+ flint_printf("%{ulong*}\n", c, an);
+ flint_printf("%{ulong*}\n", d, an);
+ flint_printf("%wu\n%wu\n\n", r1, r2);
+ flint_abort();
+ }
+
radix_clear(radix);
flint_free(a);
diff --git a/src/radix/test/t-mul_1.c b/src/radix/test/t-mul_1.c
new file mode 100644
index 0000000000..f51021c68b
--- /dev/null
+++ b/src/radix/test/t-mul_1.c
@@ -0,0 +1,72 @@
+/*
+ Copyright (C) 2026 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "test_helpers.h"
+#include "radix.h"
+
+TEST_FUNCTION_START(radix_mul_1, state)
+{
+ slong iter;
+
+ for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++)
+ {
+ radix_t radix;
+ nn_ptr a, b, c;
+ ulong x;
+ slong n;
+ ulong cy;
+
+ radix_init_randtest(radix, state);
+
+ n = 1 + n_randint(state, 5);
+
+ a = flint_malloc(n * sizeof(ulong));
+ b = flint_malloc(n * sizeof(ulong));
+ c = flint_malloc((n + 1) * sizeof(ulong));
+
+ radix_randtest_limbs(a, state, n, radix);
+ radix_randtest_limbs(b, state, n, radix);
+ radix_randtest_limbs(c, state, n + 1, radix);
+ radix_randtest_limbs(&x, state, 1, radix);
+
+ if (n_randint(state, 2))
+ {
+ cy = radix_mul_1(b, a, n, x, radix);
+ }
+ else
+ {
+ flint_mpn_copyi(b, a, n);
+ cy = radix_mul_1(b, b, n, x, radix);
+ }
+
+ radix_mulmid_naive(c, a, n, &x, 1, 0, n + 1, radix);
+
+ if (mpn_cmp(b, c, n) != 0 || cy != c[n])
+ {
+ flint_printf("FAIL: mul_1\n");
+ flint_printf("radix %wu^%wd, n = %wd\n", radix->B.n, radix->exp, n);
+ flint_printf("a = %{ulong*}\n", a, n);
+ flint_printf("b = %{ulong*}\n", b, n);
+ flint_printf("c = %{ulong*}\n", c, n + 1);
+ flint_printf("x = %wu\n", x);
+ flint_printf("cy = %wu\n", cy);
+ flint_abort();
+ }
+
+ radix_clear(radix);
+
+ flint_free(a);
+ flint_free(b);
+ flint_free(c);
+ }
+
+ TEST_FUNCTION_END(state);
+}
diff --git a/src/radix/test/t-rsqrt_1_approx.c b/src/radix/test/t-rsqrt_1_approx.c
new file mode 100644
index 0000000000..c598bfe213
--- /dev/null
+++ b/src/radix/test/t-rsqrt_1_approx.c
@@ -0,0 +1,88 @@
+/*
+ Copyright (C) 2026 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include
+#include "test_helpers.h"
+#include "radix.h"
+
+TEST_FUNCTION_START(radix_rsqrt_1_approx, state)
+{
+ slong iter;
+
+ for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++)
+ {
+ nn_ptr A, B, err;
+ ulong a;
+ slong n, errn, err_limbs = 10;
+ double dbl_err, dbl_powB;
+ slong j;
+
+ radix_t radix;
+ radix_init_randtest(radix, state);
+
+ if (n_randint(state, 100))
+ n = 1 + n_randint(state, 50);
+ else
+ n = 1 + n_randint(state, 500);
+
+ a = n_randint(state, LIMB_RADIX(radix));
+
+ if (a > 1)
+ {
+ A = flint_malloc(sizeof(ulong) * (n + err_limbs));
+ B = flint_malloc(sizeof(ulong) * (n + err_limbs));
+ err = flint_malloc(sizeof(ulong) * (n + err_limbs));
+
+ radix_randtest_limbs(A, state, n, radix);
+ radix_randtest_limbs(B, state, n, radix);
+
+ radix_rsqrt_1_approx_basecase(A, a, n + err_limbs, radix);
+ flint_mpn_zero(B, n + err_limbs);
+ flint_mpn_zero(B, err_limbs);
+ radix_rsqrt_1_approx(B + err_limbs, a, n, radix);
+
+ if (mpn_cmp(A, B, n + err_limbs) >= 0)
+ radix_sub(err, A, n + err_limbs, B, n + err_limbs, radix);
+ else
+ radix_sub(err, B, n + err_limbs, B, n + err_limbs, radix);
+
+ errn = n + err_limbs;
+ MPN_NORM(err, errn);
+
+ dbl_err = 0.0;
+ dbl_powB = pow(LIMB_RADIX(radix), -err_limbs);
+ for (j = 0; j < errn; j++)
+ {
+ dbl_err += dbl_powB * err[j];
+ dbl_powB *= LIMB_RADIX(radix);
+ }
+
+ if (dbl_err > 1.7)
+ {
+ flint_printf("FAIL: too large error: radix %wu^%wd, n = %wd\n", radix->B.n, radix->exp, n);
+ flint_printf("a = %wu\n", a);
+ flint_printf("A = %{ulong*}\n", A, n + err_limbs);
+ flint_printf("B = %{ulong*}\n", B, n + err_limbs);
+ flint_printf("err = %{ulong*}\n", err, n + err_limbs);
+ flint_printf("dbl_err: %f\n", dbl_err);
+ flint_abort();
+ }
+
+ flint_free(A);
+ flint_free(B);
+ flint_free(err);
+ }
+
+ radix_clear(radix);
+ }
+
+ TEST_FUNCTION_END(state);
+}