From c50b5b934cd4ca064ef7ccd4109316ba9b4854eb Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Thu, 19 Feb 2026 13:57:49 +0100 Subject: [PATCH 1/5] Updated radix division code; more helper functions; code for approximate square roots of small integers --- doc/source/radix.rst | 30 +++++- src/radix.h | 14 +++ src/radix/div.c | 146 ++++++++++++++++++++++++++---- src/radix/integer.c | 51 +++++++++-- src/radix/mul_1.c | 114 +++++++++++++++++++++++ src/radix/mulmid_classical.c | 1 - src/radix/profile/p-tdiv_q.c | 107 ++++++++++++++++++++++ src/radix/sqrt.c | 129 ++++++++++++++++++++++++++ src/radix/test/main.c | 4 + src/radix/test/t-divrem.c | 29 +++++- src/radix/test/t-divrem_1.c | 13 +++ src/radix/test/t-mul_1.c | 72 +++++++++++++++ src/radix/test/t-rsqrt_1_approx.c | 88 ++++++++++++++++++ 13 files changed, 769 insertions(+), 29 deletions(-) create mode 100644 src/radix/mul_1.c create mode 100644 src/radix/profile/p-tdiv_q.c create mode 100644 src/radix/sqrt.c create mode 100644 src/radix/test/t-mul_1.c create mode 100644 src/radix/test/t-rsqrt_1_approx.c diff --git a/doc/source/radix.rst b/doc/source/radix.rst index 0f32c03c07..161130937b 100644 --- a/doc/source/radix.rst +++ b/doc/source/radix.rst @@ -107,6 +107,15 @@ Except where otherwise noted, the following rules apply: Sets *(res, xn)* to the sum or difference of *(x, xn)* and *(y, yn)*, returning carry or borrow. Requires `xn \ge yn \ge 1`. +.. function:: ulong radix_mul_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix) + + Sets *(res, xn)* to *(x, xn)* multiplied by *c*, returning + carry out. Requires `1 \le d \le c`. + +.. function:: ulong radix_mul_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix) + + Sets *(res, xn)* to *(x, xn)* multiplied by 2, returning carry out. + .. function:: void radix_mul(nn_ptr res, nn_srcptr x, slong xn, nn_srcptr y, slong yn, const radix_t radix) Sets *(res, xn + yn)* to the product of *(x, xn)* and *(y, yn)*. @@ -163,6 +172,11 @@ Except where otherwise noted, the following rules apply: Sets *(res, xn)* to the quotient of *(x, xn)* divided by *d*, returning the remainder. Requires `1 \le d \le B - 1`. +.. function:: ulong radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix) + + Sets *(res, xn)* to the quotient of *(x, xn)* divided by 2, returning the + remainder. + .. function:: void radix_divexact_1(nn_ptr res, nn_srcptr x, slong xn, ulong d, const radix_t radix) Sets *(res, xn)* to the quotient of *(x, xn)* divided by *d* assuming that @@ -176,6 +190,8 @@ Except where otherwise noted, the following rules apply: `a \in [1/B, 1)` with `an` fraction limbs, sets `(q, n+2)` to an approximation of `1/a \in (1, B]` with `n` fraction limbs and two integral limbs (the highest limb may be zero). + The relative error is bounded by `4 B^{-n}`, i.e. the absolute error is bounded + by `4 B^{-n} / a`. .. function:: void radix_divrem_via_mpn(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix) void radix_divrem_newton(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix) @@ -183,14 +199,17 @@ Except where otherwise noted, the following rules apply: Sets `(q,an-bn+1)` to the quotient and `(r,bn)` to the remainder of `(a,an)` divided by `(b,bn)`. Requires `an \ge bn \ge 1` and - `b_{bn-1} \ne 0`. + `b_{bn-1} \ne 0`. The user can pass ``NULL`` for `r` to compute + only the quotient. .. function:: void radix_divrem_preinv(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, nn_srcptr binv, slong binvn, const radix_t radix) Similar to :func:`radix_divrem`, but accepts a precomputed inverse of `b` given as `(b, binvn+2)` with `binvn` fraction limbs and two integral limbs, as computed by :func:`radix_inv_approx`. - Currently requires that `binvn \ge an-bn+1`. + Currently requires that `binvn \ge an-bn+1`. Passing an inverse with + several extra limbs can improve performance. The user can pass ``NULL`` + for `r` to compute only the quotient. .. function:: int radix_div(nn_ptr q, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix) @@ -214,6 +233,13 @@ Except where otherwise noted, the following rules apply: Returns -1, 0 or 1 according to whether *(x, n)* is less, equal or greater than `\lfloor B^n / 2 \rfloor`. We require *n* to be positive. +.. function:: void radix_rsqrt_1_approx_basecase(nn_ptr res, ulong a, slong n, const radix_t radix) + void radix_rsqrt_1_approx(nn_ptr res, ulong a, slong n, const radix_t radix) + + Sets `(res,n)` to the fractional limbs of an approximation of `1 / \sqrt{a}`. + Assumes that `2 \le a < B`. The error is bounded by `2 B^{-n}`, i.e. by + 2 fixed-point ulps. + Radix conversion -------------------------------------------------------------------------------- diff --git a/src/radix.h b/src/radix.h index 2af5de3971..12a24d11c9 100644 --- a/src/radix.h +++ b/src/radix.h @@ -79,6 +79,8 @@ radix_sub_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix) /* Multiplication */ +ulong radix_mul_1(nn_ptr res, nn_srcptr a, slong n, ulong c, const radix_t radix); + void radix_mulmid_fft_small(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix); void radix_mulmid_classical(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix); void radix_mulmid_KS(nn_ptr res, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong lo, slong hi, const radix_t radix); @@ -111,10 +113,17 @@ radix_sqr(nn_ptr res, nn_srcptr a, slong an, const radix_t radix) radix_mul(res, a, an, a, an, radix); } +RADIX_INLINE ulong +radix_mul_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix) +{ + return radix_add(res, a, an, a, an, radix); +} + /* Division */ ulong radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix); void radix_divexact_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix); +ulong radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix); void radix_inv_approx_basecase(nn_ptr q, nn_srcptr a, slong an, slong n, const radix_t radix); void radix_inv_approx(nn_ptr q, nn_srcptr a, slong an, slong n, const radix_t radix); @@ -155,6 +164,11 @@ radix_cmp_bn_half(nn_srcptr x, slong n, const radix_t radix) return 0; } +/* Square roots */ + +void radix_rsqrt_1_approx_basecase(nn_ptr res, ulong a, slong n, const radix_t radix); +void radix_rsqrt_1_approx(nn_ptr res, ulong a, slong n, const radix_t radix); + /* Radix conversion */ typedef struct diff --git a/src/radix/div.c b/src/radix/div.c index 1b59768619..7ff5a33bb1 100644 --- a/src/radix/div.c +++ b/src/radix/div.c @@ -12,6 +12,29 @@ #include "longlong.h" #include "radix.h" +ulong +radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix) +{ + slong i; + ulong q, r, hi, lo, B = LIMB_RADIX(radix); + + q = a[an - 1] >> 1; + r = a[an - 1] & 1; + res[an - 1] = q; + + for (i = an - 2; i >= 0; i--) + { + umul_ppmm(hi, lo, r, B); + add_ssaaaa(hi, lo, hi, lo, 0, a[i]); + q = (hi << (FLINT_BITS - 1)) | (lo >> 1); + r = lo & 1; + res[i] = q; + } + + return r; +} + + /* todo: optimise */ ulong radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix) @@ -154,6 +177,33 @@ radix_inv_approx_basecase(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_ /* Must be at least 4 */ #define RADIX_INV_NEWTON_CUTOFF 8 +/* +Claim: the absolute error is bounded by 4*B^(-n)/A. This is not tight. + +Proof sketch: let m = ceil(n/2) + 1 + [B <= 3]. + +The initial approximation is assumed to satisfy + + Y = 1/A + e1 where e1 <= C*B^(-m)/A where we take C = 4. + +We compute Y' = Y + (1 - Y * (A + e2) + e3) * Y + e4 where + + e2 = B^(-n-1) is the error from truncating A, + + e3 = B^(-n) + {(m+2)*B^(-n-1)} + e4 = B^(-n) + {(m+2)*B^(-n-1)} + +are the errors from fixed-point multiplications where the term in curly +brackets appears only when we use mulhigh, when B > 2 * (m + 2) is satisfied. + +Expanding the expression for Y' gives + + |Y' - 1/A| <= A*e1^2 + e1^2*e2 + e1*e3 + e4 - 2*e1*e2/A + e3/A + e2/A^2 + +Verifying that the RHS satisfies the claimed bound is now straightforward +(but exhausting) numerics. +*/ + void radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix) { @@ -173,7 +223,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix) Compute T ~= 1 / A as a fixed-point number with m fraction limbs, 2 integral limbs and store in the high part of Q. */ - m = n / 2 + 1 + (LIMB_RADIX(radix) == 2); + m = (n + 1) / 2 + 1 + (LIMB_RADIX(radix) <= 3); radix_inv_approx(Q + n - m, A, An, m, radix); TMP_START; @@ -232,7 +282,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix) // product. slong low_trunc_with_mulhigh = m - 1; - if (LIMB_RADIX(radix) > m + 2 && A_low_zeroes <= low_trunc_with_mulhigh) + if (LIMB_RADIX(radix) > 2 * (m + 2) && A_low_zeroes <= low_trunc_with_mulhigh) { U = TMP_ALLOC((2 + (control_limb + 1)) * sizeof(ulong)); radix_mulmid(U, Ahigh, Ahighn, T, Tn, @@ -275,7 +325,7 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix) if (Uhighn != 0) { /* Compute V = T * U with n fraction limbs. */ - if (LIMB_RADIX(radix) > m + 2) + if (LIMB_RADIX(radix) > 2 * (m + 2)) { // V = T * |1 - A' * T| with n+2 fraction limbs V = TMP_ALLOC((Tn + Uhighn - (m - 2)) * sizeof(ulong)); @@ -304,6 +354,15 @@ radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix) } } +/* Instead of computing Q with ~1 ulp error and doing a full multiplication + to check the remainder, compute a few fraction limbs. If the first + fraction limb is not too close to the limb boundary, we have the correct + quotient and the remainder can be determined by a low multiplication + (or omitted if we only want the quotient). */ +#define USE_PREINV2 1 +/* Must be >= 2 */ +#define PREINV2_EXTRA_LIMBS 2 + void radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slong Bn, nn_srcptr Binv, slong Binvn, const radix_t radix) { @@ -319,17 +378,61 @@ radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon if (Binvn < n) flint_throw(FLINT_ERROR, "radix_divrem_preinv: inverse has too few limbs"); - if (LIMB_RADIX(radix) > n + 2) + slong n2 = n + PREINV2_EXTRA_LIMBS; + + /* + Consider A, B as fixed-point numbers with A in [0,1), 1/B in [1/beta,1), + i.e. the integer quotient is floor((A/B)*beta^(an-bn)). + + B' = High n2 limbs of Binv = 1/B * (1 + e1) where e1 <= 4*beta^(-n2) + e2 <= beta^(-n2) [truncation error for A] + e3 <= n2*beta^(-n2+1) [truncation error for product] + + (A + e2) * B' + e3 - A/B = [A/B*e1 + e3 + e1*e2/B + e2/B] + <= [(e1 + e1*e2 + e2)*beta + e3] + < [(n2 + 6) beta^(-n2+1)] + + PREINV2_EXTRA_LIMBS >= 2 ensures that q[-1] has <1 ulp error. + */ +#if USE_PREINV2 + if (Binvn >= n2 && An >= n2 && LIMB_RADIX(radix) > n2 + 6) { - U = TMP_ALLOC((n + 3) * sizeof(ulong)); - radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix); - q = U + 2; + U = TMP_ALLOC((n2 + 2) * sizeof(ulong)); + /* n2 fraction limbs x (n2 fraction limbs + 2 integral limbs) */ + radix_mulmid(U, A + An - n2, n2, Binv + Binvn - n2, n2 + 2, n2, 2 * n2 + 2, radix); + + FLINT_ASSERT(U[n2 + 1] == 0); + q = U + n2 + 2 - (n + 1); + + if (q[-1] > 1 && q[-1] < LIMB_RADIX(radix) - 1) + { + if (R == NULL) + r = NULL; + else + { + T = TMP_ALLOC(Bn * sizeof(ulong)); + r = T; + radix_mulmid(r, q, FLINT_MIN(Bn, n + 1), B, Bn, 0, Bn, radix); + radix_sub(r, A, Bn, r, Bn, radix); + } + goto done; + } } else +#endif { - U = TMP_ALLOC((2 * n + 2) * sizeof(ulong)); - radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix); - q = U + An - Bn + 2; + if (LIMB_RADIX(radix) > n + 2) + { + U = TMP_ALLOC((n + 3) * sizeof(ulong)); + radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix); + q = U + 2; + } + else + { + U = TMP_ALLOC((2 * n + 2) * sizeof(ulong)); + radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix); + q = U + An - Bn + 2; + } } T = TMP_ALLOC((Bn + n) * sizeof(ulong)); @@ -354,8 +457,10 @@ radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon radix_sub(r, r, Bn + 1, B, Bn, radix); } +done: flint_mpn_copyi(Q, q, An - Bn + 1); - flint_mpn_copyi(R, r, Bn); + if (R != NULL) + flint_mpn_copyi(R, r, Bn); TMP_END; } @@ -370,15 +475,21 @@ radix_divrem_newton(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slon if (flint_mpn_zero_p(A + Bn, An - Bn) && mpn_cmp(A, B, Bn) < 0) { - flint_mpn_copyi(R, A, Bn); + if (R != NULL) + flint_mpn_copyi(R, A, Bn); flint_mpn_zero(Q, An - Bn + 1); return; } n = An - Bn + 1; - T = TMP_ALLOC((n + 2) * sizeof(ulong)); + T = TMP_ALLOC((n + PREINV2_EXTRA_LIMBS + 2) * sizeof(ulong)); +#if USE_PREINV2 + radix_inv_approx(T, B, Bn, n + PREINV2_EXTRA_LIMBS, radix); + radix_divrem_preinv(Q, R, A, An, B, Bn, T, n + PREINV2_EXTRA_LIMBS, radix); +#else radix_inv_approx(T, B, Bn, n, radix); radix_divrem_preinv(Q, R, A, An, B, Bn, T, n, radix); +#endif TMP_END; return; } @@ -392,11 +503,13 @@ radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, c if (bn == 1) { - r[0] = radix_divrem_1(q, a, an, b[0], radix); + ulong r0 = radix_divrem_1(q, a, an, b[0], radix); + if (r != NULL) + r[0] = r0; return; } - if (bn >= 48) + if ((bn >= 20 && (an - bn + 1) <= 70) || bn >= 80) { radix_divrem_newton(q, r, a, an, b, bn, radix); return; @@ -438,7 +551,8 @@ radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, c cy = q[i * bn]; } - flint_mpn_copyi(r, R, bn); + if (r != NULL) + flint_mpn_copyi(r, R, bn); TMP_END; return; } diff --git a/src/radix/integer.c b/src/radix/integer.c index d6f5969985..d3e7197c12 100644 --- a/src/radix/integer.c +++ b/src/radix/integer.c @@ -862,6 +862,48 @@ void radix_integer_tdiv_qr(radix_integer_t q, radix_integer_t r, r->size = (asize < 0) ? -rn : rn; } +void radix_integer_tdiv_q(radix_integer_t q, + const radix_integer_t a, const radix_integer_t b, const radix_t radix) +{ + slong asize, bsize, an, bn, qn; + nn_srcptr ad, bd; + nn_ptr qd; + + asize = a->size; + bsize = b->size; + + an = FLINT_ABS(asize); + bn = FLINT_ABS(bsize); + + if (bn == 0) + flint_throw(FLINT_DIVZERO, "radix_integer_tdiv_qr: divide by zero"); + + if (an == 0) + { + radix_integer_zero(q, radix); + return; + } + + if (an < bn) + { + radix_integer_zero(q, radix); + return; + } + + qn = an - bn + 1; + + ad = a->d; + bd = b->d; + + qd = radix_integer_fit_limbs(q, qn, radix); + + radix_divrem(qd, NULL, ad, an, bd, bn, radix); + + MPN_NORM(qd, qn); + + q->size = ((asize < 0) == (bsize < 0)) ? qn : -qn; +} + void radix_integer_fdiv_qr(radix_integer_t q, radix_integer_t r, const radix_integer_t a, const radix_integer_t b, const radix_t radix) { @@ -1003,15 +1045,6 @@ void radix_integer_cdiv_qr(radix_integer_t q, radix_integer_t r, } /* Todo: optimize */ -void -radix_integer_tdiv_q(radix_integer_t q, - const radix_integer_t a, const radix_integer_t b, const radix_t radix) -{ - radix_integer_t r; - radix_integer_init(r, radix); - radix_integer_tdiv_qr(q, r, a, b, radix); - radix_integer_clear(r, radix); -} void radix_integer_fdiv_q(radix_integer_t q, diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c new file mode 100644 index 0000000000..6ab3ce8576 --- /dev/null +++ b/src/radix/mul_1.c @@ -0,0 +1,114 @@ +/* + Copyright (C) 2026 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "radix.h" + +ulong +radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix) +{ + slong i; + ulong hi, lo; + slong sbits; + nmod_t mod = radix->B; + + FLINT_ASSERT(n >= 1); + FLINT_ASSERT(c < mod.n); + + sbits = 2 * NMOD_BITS(mod); + + if (sbits <= FLINT_BITS) + { + ulong cy = 0; + FLINT_ASSERT(mod.norm != 0); + + for (i = 0; i < n; i++) + { + cy += a[i] * c; + z[i] = n_divrem_preinv_unnorm(&cy, cy, mod.n, mod.ninv, mod.norm); + } + + FLINT_ASSERT(cy < mod.n); + return cy; + } + else if (sbits <= 2 * FLINT_BITS) + { + ulong cy[2] = { 0, 0, }; + + if (mod.norm == 0) + { + for (i = 0; i < n; i++) + { + umul_ppmm(hi, lo, a[i], c); + add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo); + z[i] = flint_mpn_divrem_2_1_preinv_norm(cy, cy, mod.n, mod.ninv); + } + } + else + { + for (i = 0; i < n; i++) + { + umul_ppmm(hi, lo, a[i], c); + add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo); + z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm); + } + } + + FLINT_ASSERT(cy[0] < mod.n); + FLINT_ASSERT(cy[1] == 0); + return cy[0]; + } + else + { + if (mod.norm == 0) + { + ulong r; + ulong cy0 = 0; + ulong cy1 = 0; + ulong cy2 = 0; + + umul_ppmm(hi, lo, a[0], c); + udiv_qrnnd_preinv(cy0, r, hi, lo, mod.n, mod.ninv); + z[0] = r; + + for (i = 1; i < n; i++) + { + umul_ppmm(hi, lo, a[i], c); + add_sssaaaaaa(cy2, cy1, cy0, cy2, cy1, cy0, 0, hi, lo); + /* cy2 is certainly already reduced */ + udiv_qrnnd_preinv(cy1, r, cy2, cy1, mod.n, mod.ninv); + udiv_qrnnd_preinv(cy0, r, r, cy0, mod.n, mod.ninv); + cy2 = 0; + z[i] = r; + } + + FLINT_ASSERT(cy0 < mod.n); + FLINT_ASSERT(cy1 == 0); + return cy0; + } + else + { + ulong cy[3] = { 0, 0, 0 }; + + for (i = 0; i < n; i++) + { + umul_ppmm(hi, lo, a[i], c); + add_sssaaaaaa(cy[2], cy[1], cy[0], cy[2], cy[1], cy[0], 0, hi, lo); + z[i] = flint_mpn_divrem_3_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm); + } + + FLINT_ASSERT(cy[0] < mod.n); + FLINT_ASSERT(cy[1] == 0); + FLINT_ASSERT(cy[2] == 0); + return cy[0]; + } + } +} + diff --git a/src/radix/mulmid_classical.c b/src/radix/mulmid_classical.c index 6bcf13006e..b415fa2bcc 100644 --- a/src/radix/mulmid_classical.c +++ b/src/radix/mulmid_classical.c @@ -242,7 +242,6 @@ radix_sqrmid_classical(nn_ptr z, nn_srcptr a, slong an, slong zlo, slong zhi, co } } - void radix_mulmid_classical(nn_ptr z, nn_srcptr a, slong an, nn_srcptr b, slong bn, slong zlo, slong zhi, const radix_t radix) { diff --git a/src/radix/profile/p-tdiv_q.c b/src/radix/profile/p-tdiv_q.c new file mode 100644 index 0000000000..af99553fb0 --- /dev/null +++ b/src/radix/profile/p-tdiv_q.c @@ -0,0 +1,107 @@ +/* + Copyright (C) 2026 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include +#include "profiler.h" +#include "radix.h" +#include "fmpz.h" + +/* there is no flint_mpn_tdiv_q yet, so mock one up */ +void +flint_mpn_tdiv_q(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn) +{ + if (bn < 1000) + { + mpn_tdiv_qr(q, r, 0, a, an, b, bn); + } + else + { + fmpz_t fa, fb, fq; + + fmpz_init(fa); + fmpz_init(fb); + fmpz_init(fq); + + fmpz_set_ui_array(fa, a, an); + fmpz_set_ui_array(fb, b, bn); + + fmpz_tdiv_q(fq, fa, fb); + + //fmpz_get_ui_array(q, an - bn + 1, fq); + //fmpz_get_ui_array(r, bn, fr); + + fmpz_clear(fa); + fmpz_clear(fb); + fmpz_clear(fq); + } +} + +int main() +{ + radix_t radix; + slong digits, nd, n1, n2; + nn_ptr a, b, c, d; + double tmpn, tradix, FLINT_SET_BUT_UNUSED(tt); + + flint_rand_t state; + flint_rand_init(state); + + flint_printf(" decimal mpn decimal time time relative\n"); + flint_printf(" digits limbs limbs flint_mpn_tdiv_q radix_tdiv_q time\n\n"); + + for (nd = 1; nd <= 100000000; nd = FLINT_MAX(nd + 1, nd * 1.5)) + { + radix_init(radix, 10, 0); + + digits = nd * 19; + + /* Number of full-word limbs */ + n1 = (slong) (digits * (log(10) / (FLINT_BITS * log(2))) + 1.0); + /* Number of radix 10^e limbs */ + n2 = (digits + radix->exp - 1) / radix->exp; + + a = flint_malloc(2 * n2 * sizeof(ulong)); + b = flint_malloc(n2 * sizeof(ulong)); + c = flint_malloc((n2 + 1) * sizeof(ulong)); + d = flint_malloc(n2 * sizeof(ulong)); + + flint_mpn_urandomb(a, state, 2 * n1 * FLINT_BITS); + flint_mpn_urandomb(b, state, n1 * FLINT_BITS); + + flint_mpn_tdiv_q(c, d, a, 2 * n1, b, n1); + TIMEIT_START; + flint_mpn_tdiv_q(c, d, a, 2 * n1, b, n1); + TIMEIT_STOP_VALUES(tt, tmpn); + + radix_rand_limbs(a, state, 2 * n2, radix); + radix_rand_limbs(b, state, n2, radix); + + radix_divrem(c, NULL, a, 2 * n2, b, n2, radix); + TIMEIT_START; + radix_divrem(c, NULL, a, 2 * n2, b, n2, radix); + TIMEIT_STOP_VALUES(tt, tradix); + + flint_printf("%10wd %8wd %8wd %8g %8g %.3fx\n", + digits, n1, n2, tmpn, tradix, tradix / tmpn); + + flint_free(a); + flint_free(b); + flint_free(c); + flint_free(d); + + radix_clear(radix); + } + + flint_rand_clear(state); + flint_cleanup_master(); + return 0; +} + diff --git a/src/radix/sqrt.c b/src/radix/sqrt.c new file mode 100644 index 0000000000..3c7812fbf0 --- /dev/null +++ b/src/radix/sqrt.c @@ -0,0 +1,129 @@ +/* + Copyright (C) 2026 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "radix.h" + +void radix_rsqrt_1_approx_basecase(nn_ptr Y, ulong a, slong n, const radix_t radix) +{ + nn_ptr U, V; + slong nU, nV; + TMP_INIT; + + TMP_START; + U = TMP_ALLOC((4 * n + 2) * sizeof(ulong)); + V = U + 2 * n + 1; + + flint_mpn_zero(U, 2 * n); + U[2 * n] = a; + + nV = radix_get_mpn(V, U, 2 * n + 1, radix); + mpn_sqrtrem(U, NULL, V, nV); + nU = (nV + 1) / 2; + MPN_NORM(U, nU); + mpn_divrem_1(U, 0, U, nU, a); + MPN_NORM(U, nU); + + nV = radix_set_mpn(V, U, nU, radix); + + flint_mpn_copyi(Y, V, nV); + flint_mpn_zero(Y + nV, n - nV); + + TMP_END; +} + +// todo: allocs +// todo: fast mul_1 +// todo: fast divrem_two + +/* +The error is smaller than 2 ulp, and for large radix, 1+eps ulp. Proof sketch: +suppose on input y = 1/sqrt(a) + eps where we assume that |eps| <= C*B^(-m) +for some C < 2. The mathematical error after the Newton iteration is + + 1/sqrt(a) - 3*sqrt(a)/2 * eps^2 - a/2 * eps^3. + +The calculation of 1 - a*y^2 is done exactly and generates a fixed-point number +with 2m-limb precision. Subsequently dividing by 2 generates up to B^(-2m) +error. Rounding the result of the final multiplication by y generates up to +B^(-n) error, plus B^(-2m) error if mulhigh is used. + +Note that m >= n/2 + 1. Setting e.g. C = 1.7 one can check explicitly that +the error is bounded by C*B^(-n) in the worst case B = 3. +For larger B one can choose C closer to 1. +*/ + +void radix_rsqrt_1_approx(nn_ptr Y, ulong a, slong n, const radix_t radix) +{ + if (n <= 4) + { + radix_rsqrt_1_approx_basecase(Y, a, n, radix); + } + else + { + slong m, Un; + ulong cy; + nn_ptr T, U; + slong Talloc, Ualloc; + nn_ptr Yhi, Thi; + TMP_INIT; + + m = (n + 1) / 2 + 1; + + Yhi = Y + n - m; + radix_rsqrt_1_approx(Yhi, a, m, radix); + flint_mpn_zero(Y, n - m); + + /* TODO: reduce temporary space */ + if (LIMB_RADIX(radix) > m) + Talloc = m + 3; /* In case of high product */ + else + Talloc = 2 * m + 2; /* In case of full product */ + Ualloc = m + 2; + + TMP_START; + T = TMP_ALLOC((Talloc + Ualloc) * sizeof(ulong)); + U = T + Talloc; + + /* T = Yhi^2 with 2m fraction limbs */ + radix_mulmid(T, Yhi, m, Yhi, m, 0, m + 2, radix); + radix_mulmid(U, T, m + 2, &a, 1, 0, m + 2, radix); + cy = (U[m + 1] == 0); + /* a*y^2-1 or 1-a*y^2 */ + if (!cy) + radix_neg(U, U, m + 2, radix); + + radix_divrem_two(U, U, m + 2, radix); + Un = m + 2; + MPN_NORM(U, Un); + if (Un == 0) + goto cleanup; + + if (LIMB_RADIX(radix) > m) + { + radix_mulmid(T, Yhi, m, U, Un, m - 2, Un + m, radix); + Thi = T + 2; + } + else + { + radix_mulmid(T, Yhi, m, U, Un, 0, Un + m, radix); + Thi = T + m; + } + + if (cy) + radix_sub(Y, Y, n, Thi + 2 * m - n, Un + n - 2 * m, radix); + else + radix_add(Y, Y, n, Thi + 2 * m - n, Un + n - 2 * m, radix); + +cleanup: + TMP_END; + } +} + diff --git a/src/radix/test/main.c b/src/radix/test/main.c index 4d33bd358c..d5bb3ed9c6 100644 --- a/src/radix/test/main.c +++ b/src/radix/test/main.c @@ -18,10 +18,12 @@ #include "t-integer.c" #include "t-inv_approx.c" #include "t-invmod_bn.c" +#include "t-mul_1.c" #include "t-mulmid_classical.c" #include "t-mulmid_fft_small.c" #include "t-mulmid_KS.c" #include "t-neg.c" +#include "t-rsqrt_1_approx.c" #include "t-set_mpn.c" #include "t-sub.c" @@ -36,10 +38,12 @@ test_struct tests[] = TEST_FUNCTION(radix_integer), TEST_FUNCTION(radix_inv_approx), TEST_FUNCTION(radix_invmod_bn), + TEST_FUNCTION(radix_mul_1), TEST_FUNCTION(radix_mulmid_classical), TEST_FUNCTION(radix_mulmid_fft_small), TEST_FUNCTION(radix_mulmid_KS), TEST_FUNCTION(radix_neg), + TEST_FUNCTION(radix_rsqrt_1_approx), TEST_FUNCTION(radix_set_mpn), TEST_FUNCTION(radix_sub), }; diff --git a/src/radix/test/t-divrem.c b/src/radix/test/t-divrem.c index 2532425750..ce72875069 100644 --- a/src/radix/test/t-divrem.c +++ b/src/radix/test/t-divrem.c @@ -19,7 +19,7 @@ TEST_FUNCTION_START(radix_divrem, state) for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++) { radix_t radix; - nn_ptr a, b, q, r, bq, bqr; + nn_ptr a, b, q, q2, r, bq, bqr; slong an, bn, qn, rn; int algorithm, aliasing; @@ -35,6 +35,7 @@ TEST_FUNCTION_START(radix_divrem, state) a = flint_malloc(an * sizeof(ulong)); b = flint_malloc(bn * sizeof(ulong)); q = flint_malloc(qn * sizeof(ulong)); + q2 = flint_malloc(qn * sizeof(ulong)); r = flint_malloc(rn * sizeof(ulong)); bq = flint_malloc(an * sizeof(ulong)); bqr = flint_malloc(an * sizeof(ulong)); @@ -123,11 +124,37 @@ TEST_FUNCTION_START(radix_divrem, state) flint_abort(); } + aliasing = n_randint(state, 2); + + if (aliasing == 0) + { + divrem_func(q2, NULL, a, an, b, bn, radix); + } + else + { + q2 = flint_realloc(q2, an * sizeof(ulong)); + flint_mpn_copyi(q2, a, an); + divrem_func(q2, NULL, q2, an, b, bn, radix); + } + + if (mpn_cmp(q, q2, qn) != 0) + { + flint_printf("FAIL: radix_divrem (r == NULL)\n"); + flint_printf("algorithm = %d\n", algorithm); + flint_printf("aliasing = %d\n", aliasing); + flint_printf("a = %{ulong*}\n", a, an); + flint_printf("b = %{ulong*}\n", b, bn); + flint_printf("q = %{ulong*}\n", q, qn); + flint_printf("q2 = %{ulong*}\n", q2, qn); + flint_abort(); + } + radix_clear(radix); flint_free(a); flint_free(b); flint_free(q); + flint_free(q2); flint_free(r); flint_free(bq); flint_free(bqr); diff --git a/src/radix/test/t-divrem_1.c b/src/radix/test/t-divrem_1.c index ad095494de..d134e3ab3d 100644 --- a/src/radix/test/t-divrem_1.c +++ b/src/radix/test/t-divrem_1.c @@ -108,6 +108,19 @@ TEST_FUNCTION_START(radix_divrem_1, state) flint_abort(); } + r1 = radix_divrem_two(c, a, an, radix); + r2 = radix_divrem_1_naive(d, a, an, 2, radix); + + if (r1 != r2 || mpn_cmp(c, d, an) != 0) + { + flint_printf("FAIL: divrem_two\n"); + flint_printf("%{ulong*}\n", a, an); + flint_printf("%{ulong*}\n", c, an); + flint_printf("%{ulong*}\n", d, an); + flint_printf("%wu\n%wu\n\n", r1, r2); + flint_abort(); + } + radix_clear(radix); flint_free(a); diff --git a/src/radix/test/t-mul_1.c b/src/radix/test/t-mul_1.c new file mode 100644 index 0000000000..f51021c68b --- /dev/null +++ b/src/radix/test/t-mul_1.c @@ -0,0 +1,72 @@ +/* + Copyright (C) 2026 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "radix.h" + +TEST_FUNCTION_START(radix_mul_1, state) +{ + slong iter; + + for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++) + { + radix_t radix; + nn_ptr a, b, c; + ulong x; + slong n; + ulong cy; + + radix_init_randtest(radix, state); + + n = 1 + n_randint(state, 5); + + a = flint_malloc(n * sizeof(ulong)); + b = flint_malloc(n * sizeof(ulong)); + c = flint_malloc((n + 1) * sizeof(ulong)); + + radix_randtest_limbs(a, state, n, radix); + radix_randtest_limbs(b, state, n, radix); + radix_randtest_limbs(c, state, n + 1, radix); + radix_randtest_limbs(&x, state, 1, radix); + + if (n_randint(state, 2)) + { + cy = radix_mul_1(b, a, n, x, radix); + } + else + { + flint_mpn_copyi(b, a, n); + cy = radix_mul_1(b, b, n, x, radix); + } + + radix_mulmid_naive(c, a, n, &x, 1, 0, n + 1, radix); + + if (mpn_cmp(b, c, n) != 0 || cy != c[n]) + { + flint_printf("FAIL: mul_1\n"); + flint_printf("radix %wu^%wd, n = %wd\n", radix->B.n, radix->exp, n); + flint_printf("a = %{ulong*}\n", a, n); + flint_printf("b = %{ulong*}\n", b, n); + flint_printf("c = %{ulong*}\n", c, n + 1); + flint_printf("x = %wu\n", x); + flint_printf("cy = %wu\n", cy); + flint_abort(); + } + + radix_clear(radix); + + flint_free(a); + flint_free(b); + flint_free(c); + } + + TEST_FUNCTION_END(state); +} diff --git a/src/radix/test/t-rsqrt_1_approx.c b/src/radix/test/t-rsqrt_1_approx.c new file mode 100644 index 0000000000..c598bfe213 --- /dev/null +++ b/src/radix/test/t-rsqrt_1_approx.c @@ -0,0 +1,88 @@ +/* + Copyright (C) 2026 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include +#include "test_helpers.h" +#include "radix.h" + +TEST_FUNCTION_START(radix_rsqrt_1_approx, state) +{ + slong iter; + + for (iter = 0; iter < 10000 * flint_test_multiplier(); iter++) + { + nn_ptr A, B, err; + ulong a; + slong n, errn, err_limbs = 10; + double dbl_err, dbl_powB; + slong j; + + radix_t radix; + radix_init_randtest(radix, state); + + if (n_randint(state, 100)) + n = 1 + n_randint(state, 50); + else + n = 1 + n_randint(state, 500); + + a = n_randint(state, LIMB_RADIX(radix)); + + if (a > 1) + { + A = flint_malloc(sizeof(ulong) * (n + err_limbs)); + B = flint_malloc(sizeof(ulong) * (n + err_limbs)); + err = flint_malloc(sizeof(ulong) * (n + err_limbs)); + + radix_randtest_limbs(A, state, n, radix); + radix_randtest_limbs(B, state, n, radix); + + radix_rsqrt_1_approx_basecase(A, a, n + err_limbs, radix); + flint_mpn_zero(B, n + err_limbs); + flint_mpn_zero(B, err_limbs); + radix_rsqrt_1_approx(B + err_limbs, a, n, radix); + + if (mpn_cmp(A, B, n + err_limbs) >= 0) + radix_sub(err, A, n + err_limbs, B, n + err_limbs, radix); + else + radix_sub(err, B, n + err_limbs, B, n + err_limbs, radix); + + errn = n + err_limbs; + MPN_NORM(err, errn); + + dbl_err = 0.0; + dbl_powB = pow(LIMB_RADIX(radix), -err_limbs); + for (j = 0; j < errn; j++) + { + dbl_err += dbl_powB * err[j]; + dbl_powB *= LIMB_RADIX(radix); + } + + if (dbl_err > 1.7) + { + flint_printf("FAIL: too large error: radix %wu^%wd, n = %wd\n", radix->B.n, radix->exp, n); + flint_printf("a = %wu\n", a); + flint_printf("A = %{ulong*}\n", A, n + err_limbs); + flint_printf("B = %{ulong*}\n", B, n + err_limbs); + flint_printf("err = %{ulong*}\n", err, n + err_limbs); + flint_printf("dbl_err: %f\n", dbl_err); + flint_abort(); + } + + flint_free(A); + flint_free(B); + flint_free(err); + } + + radix_clear(radix); + } + + TEST_FUNCTION_END(state); +} From e0cea1d26c2fee3cc4d32f862c8993ee61d37283 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Thu, 19 Feb 2026 15:32:06 +0100 Subject: [PATCH 2/5] Avoid add of length <= 0 --- src/radix/sqrt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/radix/sqrt.c b/src/radix/sqrt.c index 3c7812fbf0..175099980f 100644 --- a/src/radix/sqrt.c +++ b/src/radix/sqrt.c @@ -103,7 +103,7 @@ void radix_rsqrt_1_approx(nn_ptr Y, ulong a, slong n, const radix_t radix) radix_divrem_two(U, U, m + 2, radix); Un = m + 2; MPN_NORM(U, Un); - if (Un == 0) + if (Un + n - 2 * m <= 0) goto cleanup; if (LIMB_RADIX(radix) > m) From 21167816bef31acc59e6adf4ef70ff8514425094 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Thu, 19 Feb 2026 17:13:32 +0100 Subject: [PATCH 3/5] Radix constants profile program --- src/radix/profile/p-constants.c | 305 ++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 src/radix/profile/p-constants.c diff --git a/src/radix/profile/p-constants.c b/src/radix/profile/p-constants.c new file mode 100644 index 0000000000..55486d8c6c --- /dev/null +++ b/src/radix/profile/p-constants.c @@ -0,0 +1,305 @@ +/* + Copyright (C) 2026 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include +#include "arb.h" +#include "radix.h" +#include "double_extras.h" +#include "mag.h" +#include "profiler.h" +#include "hypgeom.h" +#include "fmpz_poly.h" + +/* Don't call arb_const_e because it caches the result. */ +static void +arb_const_e_eval(arb_t s, slong prec) +{ + hypgeom_t series; + arb_t t; + + arb_init(t); + hypgeom_init(series); + + fmpz_poly_set_str(series->A, "1 1"); + fmpz_poly_set_str(series->B, "1 1"); + fmpz_poly_set_str(series->P, "1 1"); + fmpz_poly_set_str(series->Q, "2 0 1"); + + prec += FLINT_CLOG2(prec); + arb_hypgeom_infsum(s, t, series, prec, prec); + arb_div(s, s, t, prec); + + hypgeom_clear(series); + arb_clear(t); +} + +char * radix_integer_get_str_decimal(char * s, const radix_integer_t x, const radix_t radix) +{ + return radix_get_str_decimal(s, x->d, FLINT_ABS(x->size), x->size < 0, radix); +} + +slong radix_digits_to_limbs(slong n, const radix_t radix) +{ + return (n + radix->exp - 1) / radix->exp; +} + +void +print_condensed(const char * s, slong ndigits, slong n) +{ + char * tmp = flint_malloc(n + 1); + + strncpy(tmp, s, 1); + tmp[1] = '\0'; + flint_printf("%s.", tmp); + + strncpy(tmp, s + 1, n); + tmp[n] = '\0'; + + flint_printf("%s{...%wd digits...}%s\n", tmp, ndigits - 2 * n - 1, s + ndigits - n); + + flint_free(tmp); +} + +/* Calculate sqrt(a) as a fixed-point number with n fraction limbs. */ +void +radix_integer_fixed_sqrt_1(radix_integer_t res, ulong a, slong n, const radix_t radix) +{ + nn_ptr rd; + rd = radix_integer_fit_limbs(res, n + 1, radix); + radix_rsqrt_1_approx(rd, a, n, radix); + if (a == 2) + rd[n] = radix_mul_two(rd, rd, n, radix); + else + rd[n] = radix_mul_1(rd, rd, n, a, radix); + res->size = n + 1; +} + +#define BSPLIT_E_MPN_BASECASE 2 + +static void +bsplit_e(radix_integer_t P, radix_integer_t Q, slong a, slong b, const radix_t radix) +{ + if (b - a == 1) + { + radix_integer_one(P, radix); + radix_integer_set_ui(Q, b, radix); + } + else if (b - a == 2 && b < (UWORD(1) << (FLINT_BITS / 2))) + { + radix_integer_set_ui(P, a + 3, radix); + radix_integer_set_ui(Q, (a + 1) * (a + 2), radix); + } + else if (b - a <= BSPLIT_E_MPN_BASECASE && b < (UWORD(1) << (FLINT_BITS / 2))) + { + ulong p[BSPLIT_E_MPN_BASECASE]; + ulong q[BSPLIT_E_MPN_BASECASE]; + ulong p2, q2; + slong pn, qn, k; + + p[0] = a + 3; + q[0] = (a + 1) * (a + 2); + pn = qn = 1; + + for (k = a + 2; k < b; k++) + { + if (k + 1 < b) { p2 = k + 3; q2 = (k + 1) * (k + 2); k++; } + else { p2 = 1; q2 = k + 1; } + + p[pn] = mpn_mul_1(p, p, pn, q2); + p[pn] += mpn_add_1(p, p, pn, p2); + pn += (p[pn] != 0); + q[qn] = mpn_mul_1(q, q, qn, q2); + qn += (q[qn] != 0); + } + + P->size = radix_set_mpn(radix_integer_fit_limbs(P, + radix_set_mpn_need_alloc(pn, radix), radix), p, pn, radix); + Q->size = radix_set_mpn(radix_integer_fit_limbs(Q, + radix_set_mpn_need_alloc(qn, radix), radix), q, qn, radix); + } + else + { + slong m = a + (b - a) / 2; + + radix_integer_t P1, Q1, P2, Q2; + + radix_integer_init(P1, radix); + radix_integer_init(Q1, radix); + radix_integer_init(P2, radix); + radix_integer_init(Q2, radix); + + bsplit_e(P1, Q1, a, m, radix); + bsplit_e(P2, Q2, m, b, radix); + + radix_integer_mul(P, P1, Q2, radix); + radix_integer_add(P, P, P2, radix); + radix_integer_mul(Q, Q1, Q2, radix); + + radix_integer_clear(P1, radix); + radix_integer_clear(Q1, radix); + radix_integer_clear(P2, radix); + radix_integer_clear(Q2, radix); + } +} + +void +radix_integer_fixed_e(radix_integer_t res, slong n, const radix_t radix) +{ + slong N; + radix_integer_t P, Q; + + /* Select truncation N */ + { + mag_t one, eps, err; + + double logB = log(LIMB_RADIX(radix)); + N = n * logB / d_lambertw(n * logB * exp(-1.0)) + 1; + + mag_init(one); + mag_init(eps); + mag_init(err); + + mag_one(one); + mag_set_ui(eps, LIMB_RADIX(radix)); + mag_inv_lower(eps, eps); + mag_pow_ui_lower(eps, eps, n); + + while (1) + { + mag_exp_tail(err, one, N + 1); + if (mag_cmp(err, eps) <= 0) + break; + else + N++; + } + + mag_clear(one); + mag_clear(err); + mag_clear(eps); + } + + radix_integer_init(P, radix); + radix_integer_init(Q, radix); + + bsplit_e(P, Q, 1, N, radix); + radix_integer_lshift_limbs(P, P, n, radix); + radix_integer_tdiv_q(res, P, Q, radix); + radix_integer_set_limb(res, res, n, 2, radix); + + radix_integer_clear(P, radix); + radix_integer_clear(Q, radix); +} + +int main() +{ + slong ndigits; + radix_t radix; + char * s; + + radix_init(radix, 10, 0); + + flint_set_num_threads(1); + + for (ndigits = 100000; ndigits <= 1000000000; ndigits *= 10) + { + radix_integer_t x; + arb_t y; + + slong prec_bits = 3.32192809488736 * ndigits + 1; + slong prec_radix = radix_digits_to_limbs(ndigits, radix); + + flint_printf("\n10^%wd digits (%wd bits; %wd decimal limbs = %wd digits)\n", + n_clog(ndigits, 10), + prec_bits, prec_radix, prec_radix * radix->exp); + + arb_init(y); + flint_printf("Arb compute sqrt(2): "); + TIMEIT_START; + arb_sqrt_ui(y, 2, prec_bits); + TIMEIT_STOP; + + flint_printf("Arb to string: "); + s = NULL; + TIMEIT_START; + flint_free(s); + s = arb_get_str(y, ndigits, ARB_STR_CONDENSE * 20); + TIMEIT_STOP; + + flint_printf("%s\n", s); + arb_clear(y); + + radix_integer_init(x, radix); + flint_printf("Radix compute sqrt(2): "); + TIMEIT_START; + radix_integer_fixed_sqrt_1(x, 2, prec_radix, radix); + TIMEIT_STOP; + + flint_printf("Radix to string: "); + s = NULL; + TIMEIT_START; + flint_free(s); + s = radix_integer_get_str_decimal(NULL, x, radix); + TIMEIT_STOP; + + print_condensed(s, ndigits, 20); + radix_integer_clear(x, radix); + } + + for (ndigits = 100000; ndigits <= 1000000000; ndigits *= 10) + { + radix_integer_t x; + arb_t y; + + slong prec_bits = 3.32192809488736 * ndigits + 1; + slong prec_radix = radix_digits_to_limbs(ndigits, radix); + + flint_printf("\n10^%wd digits (%wd bits; %wd decimal limbs = %wd digits)\n", + n_clog(ndigits, 10), + prec_bits, prec_radix, prec_radix * radix->exp); + + arb_init(y); + flint_printf("Arb compute e: "); + TIMEIT_START; + arb_const_e_eval(y, prec_bits); + TIMEIT_STOP; + + flint_printf("Arb to string: "); + s = NULL; + TIMEIT_START; + flint_free(s); + s = arb_get_str(y, ndigits, ARB_STR_CONDENSE * 20); + TIMEIT_STOP; + + flint_printf("%s\n", s); + arb_clear(y); + + radix_integer_init(x, radix); + flint_printf("Radix compute e: "); + TIMEIT_START; + radix_integer_fixed_e(x, prec_radix, radix); + TIMEIT_STOP; + + flint_printf("Radix to string: "); + s = NULL; + TIMEIT_START; + flint_free(s); + s = radix_integer_get_str_decimal(NULL, x, radix); + TIMEIT_STOP; + + print_condensed(s, ndigits, 20); + radix_integer_clear(x, radix); + } + + radix_clear(radix); + flint_cleanup_master(); + return 0; +} + From bbc2c953ae028d7c8dbfa74e057bd52397648a5d Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Thu, 19 Feb 2026 22:35:15 +0100 Subject: [PATCH 4/5] Improve radix_mul_1 --- src/radix/mul_1.c | 49 +++++++---------------------------------------- 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c index 6ab3ce8576..1d2f2d7b4b 100644 --- a/src/radix/mul_1.c +++ b/src/radix/mul_1.c @@ -38,54 +38,20 @@ radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix) FLINT_ASSERT(cy < mod.n); return cy; } - else if (sbits <= 2 * FLINT_BITS) - { - ulong cy[2] = { 0, 0, }; - - if (mod.norm == 0) - { - for (i = 0; i < n; i++) - { - umul_ppmm(hi, lo, a[i], c); - add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo); - z[i] = flint_mpn_divrem_2_1_preinv_norm(cy, cy, mod.n, mod.ninv); - } - } - else - { - for (i = 0; i < n; i++) - { - umul_ppmm(hi, lo, a[i], c); - add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo); - z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm); - } - } - - FLINT_ASSERT(cy[0] < mod.n); - FLINT_ASSERT(cy[1] == 0); - return cy[0]; - } else { if (mod.norm == 0) { - ulong r; ulong cy0 = 0; ulong cy1 = 0; - ulong cy2 = 0; - - umul_ppmm(hi, lo, a[0], c); - udiv_qrnnd_preinv(cy0, r, hi, lo, mod.n, mod.ninv); - z[0] = r; + ulong r; - for (i = 1; i < n; i++) + for (i = 0; i < n; i++) { umul_ppmm(hi, lo, a[i], c); - add_sssaaaaaa(cy2, cy1, cy0, cy2, cy1, cy0, 0, hi, lo); - /* cy2 is certainly already reduced */ - udiv_qrnnd_preinv(cy1, r, cy2, cy1, mod.n, mod.ninv); + add_ssaaaa(cy1, cy0, cy1, cy0, hi, lo); + r = n_divrem_norm(&cy1, cy1, mod.n); udiv_qrnnd_preinv(cy0, r, r, cy0, mod.n, mod.ninv); - cy2 = 0; z[i] = r; } @@ -95,18 +61,17 @@ radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix) } else { - ulong cy[3] = { 0, 0, 0 }; + ulong cy[2] = { 0, 0, }; for (i = 0; i < n; i++) { umul_ppmm(hi, lo, a[i], c); - add_sssaaaaaa(cy[2], cy[1], cy[0], cy[2], cy[1], cy[0], 0, hi, lo); - z[i] = flint_mpn_divrem_3_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm); + add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo); + z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm); } FLINT_ASSERT(cy[0] < mod.n); FLINT_ASSERT(cy[1] == 0); - FLINT_ASSERT(cy[2] == 0); return cy[0]; } } From 45be05939460ee33b8657e31fec850671c75402f Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Fri, 20 Feb 2026 09:40:14 +0100 Subject: [PATCH 5/5] Inline division code in radix_mul_1 --- src/radix/mul_1.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/radix/mul_1.c b/src/radix/mul_1.c index 1d2f2d7b4b..e82f0aefdf 100644 --- a/src/radix/mul_1.c +++ b/src/radix/mul_1.c @@ -11,6 +11,36 @@ #include "radix.h" +/* Inline version of flint_mpn_divrem_2_1_preinv_unnorm for speed. */ +FLINT_FORCE_INLINE mp_limb_t +flint_mpn_divrem_2_1_preinv_unnorm1(mp_ptr qp, mp_srcptr up, mp_limb_t d, mp_limb_t dinv, unsigned int norm) +{ + mp_limb_t u0, u1, r; + + FLINT_ASSERT(norm >= 1); + + u1 = up[1]; + u0 = up[0]; + if (u1 < d) + { + d <<= norm; + qp[1] = 0; + r = (u1 << norm) | (u0 >> (FLINT_BITS - norm)); + } + else + { + /* Hack: this branch is unreachable, but leaving it in results + in faster code with GCC on Zen 3. */ + d <<= norm; + r = (u1 >> (FLINT_BITS - norm)); + udiv_qrnnd_preinv(qp[1], r, r, (u1 << norm) | (u0 >> (FLINT_BITS - norm)), d, dinv); + } + + udiv_qrnnd_preinv(qp[0], r, r, u0 << norm, d, dinv); + return r >> norm; +} + + ulong radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix) { @@ -67,7 +97,7 @@ radix_mul_1(nn_ptr z, nn_srcptr a, slong n, ulong c, const radix_t radix) { umul_ppmm(hi, lo, a[i], c); add_ssaaaa(cy[1], cy[0], cy[1], cy[0], hi, lo); - z[i] = flint_mpn_divrem_2_1_preinv_unnorm(cy, cy, mod.n, mod.ninv, mod.norm); + z[i] = flint_mpn_divrem_2_1_preinv_unnorm1(cy, cy, mod.n, mod.ninv, mod.norm); } FLINT_ASSERT(cy[0] < mod.n);