From 0ece3a0d6be1c9cccfa5dba197b524db5fd22010 Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Sat, 25 Apr 2026 14:42:25 +0200
Subject: [PATCH 1/8] Add Filon integration type for efficient integration
 weighted by spherical Bessel functions

---
 src/SymBoltz.jl |   4 +-
 src/filon.jl    | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/solve.jl    |   1 -
 3 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 src/filon.jl

diff --git a/src/SymBoltz.jl b/src/SymBoltz.jl
index 27e5b754..56604fa9 100644
--- a/src/SymBoltz.jl
+++ b/src/SymBoltz.jl
@@ -11,6 +11,7 @@ using OhMyThreads
 using Base.Threads
 using Setfield
 using StaticArrays
+using NumericalIntegration
 
 # TODO: generate gravity equations
 # TODO: modified gravity: coupled quintessence; DGP, parametrized framework, EFT of LSS, ...
@@ -40,6 +41,7 @@ include("models/curvature.jl")
 include("models/inflation.jl")
 include("models/cosmologies.jl")
 include("solve.jl")
+include("filon.jl")
 include("observables/distances.jl")
 include("observables/fourier.jl")
 include("observables/angular.jl")
@@ -51,7 +53,7 @@ export CosmologyProblem, CosmologySolution
 export background, perturbations, expandeq
 export solve, solvebg, solvept, remake, issuccess, parameter_updater
 export parameters_Planck18
-export spectrum_primordial, spectrum_matter, spectrum_matter_nonlinear, spectrum_cmb, correlation_function, variance_matter, stddev_matter, los_integrate, source_grid, source_grid_adaptive, sound_horizon, distance_luminosity, SphericalBesselCache
+export spectrum_primordial, spectrum_matter, spectrum_matter_nonlinear, spectrum_cmb, correlation_function, variance_matter, stddev_matter, los_integrate, source_grid, source_grid_adaptive, sound_horizon, distance_luminosity, SphericalBesselCache, SphericalBesselIntegralCache
 export express_derivatives
 
 using PrecompileTools: @compile_workload
diff --git a/src/filon.jl b/src/filon.jl
new file mode 100644
index 00000000..4b99a20f
--- /dev/null
+++ b/src/filon.jl
@@ -0,0 +1,127 @@
+using Bessels
+
+struct SphericalBesselIntegralCache{T <: AbstractFloat}
+    l::Vector{Int}
+    x::Vector{T} # uniform grid
+    I0::Matrix{T} # ∫dx jₗ(x) x⁰ cumulatively to this x-point
+    I1::Matrix{T} # ∫dx jₗ(x) x¹ cumulatively to this x-point
+    dI0::Matrix{T} # ∫dx jₗ(x) x⁰ from this to next x-point (0 for last point)
+    dI1::Matrix{T} # ∫dx jₗ(x) x¹ from this to next x-point (0 for last point)
+    invdx::T # 1/dx (constant because grid is uniform)
+    tmp1::Vector{T} # temporary workspace per l
+    tmp2::Vector{T} # temporary workspace per l
+    tmp3::Vector{T} # temporary workspace per l
+    tmp4::Vector{T} # temporary workspace per l
+end
+
+# TODO: adaptively grid size from tolerance parameter? can compare to sin
+function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_integral = 2π/128, method = TrapezoidalEven())
+    Nx = 1 + Int(ceil((x2 - x1) / mindx_grid))
+    xs = range(x1, x2, length=Nx)
+    dx = step(xs)
+    xs = collect(xs)
+
+    mindx_integral ≤ dx ≤ mindx_grid || error("Integration grid must be finer than interpolation grid")
+
+    T = typeof(x1)
+    I0s = zeros(T, length(ls), length(xs)) # TODO: add dummy for final+1 lookup?
+    I1s = zeros(T, length(ls), length(xs))
+    dI0s = zeros(T, length(ls), length(xs))
+    dI1s = zeros(T, length(ls), length(xs))
+
+    Nsubx = 1 + Int(ceil((xs[2] - xs[1]) / mindx_integral))
+    subis = zeros(T, Nsubx)
+
+    @inbounds for il in eachindex(ls)
+        l = ls[il]
+        for ix in 1:length(xs)-1
+            subxs = range(xs[ix], xs[ix+1]; length = Nsubx) # fine integration grid
+            subis .= sphericalbesselj.(l, subxs) # jₗ(x) x⁰
+            dI0s[il, ix] = NumericalIntegration.integrate(subxs, subis, method) # ∫ jₗ(x) x⁰ dx
+            subis .*= subxs # jₗ(x) x¹
+            dI1s[il, ix] = NumericalIntegration.integrate(subxs, subis, method) # ∫ jₗ(x) x¹ dx
+            I0s[il, ix+1] = I0s[il, ix] + dI0s[il, ix] # cumulative
+            I1s[il, ix+1] = I1s[il, ix] + dI1s[il, ix] # cumulative
+        end
+    end
+
+    tmp1 = zeros(T, length(ls))
+    tmp2 = zeros(T, length(ls))
+    tmp3 = zeros(T, length(ls))
+    tmp4 = zeros(T, length(ls))
+    return SphericalBesselIntegralCache{T}(ls, xs, I0s, I1s, dI0s, dI1s, 1.0/dx, tmp1, tmp2, tmp3, tmp4)
+end
+
+function Base.show(io::IO, jlint::SphericalBesselIntegralCache)
+    print(io, "jₗ(x) integration cache ")
+    print(io, "for $(jlint.l[begin]) ≤ l ≤ $(jlint.l[end]) and ")
+    print(io, "$(jlint.x[begin]) ≤ x ≤ $(jlint.x[end]) ")
+    print(io, "($(Base.format_bytes(Base.summarysize(jlint))))\n")
+end
+
+# TODO: Hermite interpolation, have derivative for free: could use both (I0, w) and (I1, w*x)? https://en.wikipedia.org/wiki/Cubic_Hermite_spline
+Base.@propagate_inbounds function (jlint::SphericalBesselIntegralCache)(out0::AbstractArray{<:AbstractFloat}, out1::AbstractArray{<:AbstractFloat}, x)
+    x1 = jlint.x[begin]
+    ix = 1 + unsafe_trunc(Int, (x-x1)*jlint.invdx) # O(1) uniform grid lookup (corresponding to left point) # TODO: floor(Int, instead?
+    x₋ = jlint.x[ix]
+    w = (x - x₋) * jlint.invdx
+    ibase = (ix - 1) * length(jlint.l)
+    @inbounds @fastmath #=@simd=# for il in eachindex(out0)
+        i = ibase + il
+        I0 = jlint.I0[i]
+        I1 = jlint.I1[i]
+        dI0 = jlint.dI0[i] # TODO: don't look up; keep stored for prev/next point in main integrate loop
+        dI1 = jlint.dI1[i]
+        out0[il] = muladd(w, dI0, I0) # TODO: avoid doing this so many times; make function that computes I(b)-I(a) instead
+        out1[il] = muladd(w, dI1, I1)
+    end
+    return nothing
+end
+
+function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray, ys::AbstractArray)
+    xs[2] > xs[1] || error("x-domain is not strictly increasing")
+    xs[begin] ≥ jlint.x[begin] || error("$(xs[begin]) is outside integral cache left bound $(jlint.x[begin])")
+    xs[end] ≤ jlint.x[end] || error("$(xs[end]) is outside integral cache right bound $(jlint.x[end])")
+    out .= 0.0
+    I0₋, I0₊, I1₋, I1₊ = jlint.tmp1, jlint.tmp2, jlint.tmp3, jlint.tmp4 # TODO: thread-unsafe
+    i₋ = 1
+    x₋ = xs[i₋]
+    y₋ = ys[i₋]
+    jlint(I0₋, I1₋, x₋)
+    @inbounds @fastmath for i₊ in 2:length(xs) # TODO: write out and unroll loop to look up jlint(x) once per x and l without setindex?
+        x₊ = xs[i₊]
+        y₊ = ys[i₊]
+        A = (y₊ - y₋) / (x₊ - x₋) # A in y = Ax + B
+        B = y₋ - A*x₋ # B in y = Ax + B
+        # TODO: quadratic; C?
+        jlint(I0₊, I1₊, x₊) # TODO: write out this to avoid l-indexing?
+        #=@simd=# for il in eachindex(out)
+            I0 = I0₊[il] - I0₋[il] # ∫dx jₗ(x) x⁰ from x₋ to x₊
+            I1 = I1₊[il] - I1₋[il] # ∫dx jₗ(x) x¹ from x₋ to x₊ # TODO: interpolate differences instead?
+            out[il] = muladd(A, I1, muladd(B, I0, out[il])) # i.e. out[il] += A*I1 + B*I0
+        end
+        I0₋, I1₋, I0₊, I1₊ = I0₊, I1₊, I0₋, I1₋ # pointer-like swap for next iteration
+        i₋, x₋, y₋ = i₊, x₊, y₊ # pass to next iteration
+    end
+    return out
+end
+
+# Convenience dispatches
+function integrate(out, jlint::SphericalBesselIntegralCache, f::Function, x1, x2; kw...)
+    xs = range(x1, x2; kw...)
+    ys = f.(xs)
+    return integrate(out, jlint, xs, ys)
+end
+function integrate(out, jlint::SphericalBesselIntegralCache, f::Function, x2; kw...)
+    x1 = jlint.x[begin]
+    return integrate(out, jlint, f, x1, x2; kw...)
+end
+function integrate(out, jlint::SphericalBesselIntegralCache, f::Function; kw...)
+    x2 = jlint.x[end]
+    return integrate(out, jlint, f, x2; kw...)
+end
+function integrate(jlint::SphericalBesselIntegralCache, args...; kw...)
+    out = zeros(eltype(jlint.I0), length(jlint.l))
+    integrate(out, jlint, args...; kw...)
+    return out
+end
\ No newline at end of file
diff --git a/src/solve.jl b/src/solve.jl
index 5d2aa058..48dfbb36 100644
--- a/src/solve.jl
+++ b/src/solve.jl
@@ -9,7 +9,6 @@ import OhMyThreads: TaskLocalValue
 import SymbolicIndexingInterface
 import SymbolicIndexingInterface: getsym, setsym_oop, parameter_values
 using RecursiveFactorization # makes RFLUFactorization() available as linear solver: https://docs.sciml.ai/LinearSolve/stable/tutorials/accelerating_choices/
-import NumericalIntegration: cumul_integrate
 using SparseArrays
 import NonlinearSolve.BracketingNonlinearSolve: AbstractBracketingAlgorithm
 

From da33e25a10cd98a63e03afe4190e827d4b4dea66 Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Sat, 25 Apr 2026 18:02:09 +0200
Subject: [PATCH 2/8] Test Filon integration

---
 test/runtests.jl | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index f1ed8014..5a47a155 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -717,3 +717,37 @@ end
     ]
     @test isequal(expandeq(eqs, D(ρ); protect = Set(ℋ)), -4ℋ*ρ)
 end
+
+@testset "Filon integration" begin
+    ls = 0:100
+    jlint = SymBoltz.SphericalBesselIntegralCache(ls, 0.0, 100.0; mindx_grid = 2π/128, mindx_integral = 2π/1024, method = SymBoltz.SimpsonEven())
+
+    # Test analytical result exactly at grid points
+    I1s = jlint.I1[1, :]
+    I1s_anal = 1 .- cos.(jlint.x)
+    @test all(isapprox.(I1s, I1s_anal; atol = 1e-10))
+
+    # Test analytical result exactly between grid points (where interpolation error should be largest)
+    xmid = (jlint.x[begin:end-1] .+ jlint.x[begin+1:end]) ./ 2
+    I1s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [0.0, x])[begin], xmid)
+    I1s_anal = 1 .- cos.(xmid)
+    @test all(isapprox.(I1s, I1s_anal; atol = 1e-3))
+
+    # Test that integrals interpolate almost exactly at grid points
+    I0s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [1.0, 1.0])[begin], jlint.x[2:end])
+    I1s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [0.0, x])[begin], jlint.x[2:end])
+    @test all(isapprox.(I0s, jlint.I0[1, 2:end]; atol = 1e-15))
+    @test all(isapprox.(I1s, jlint.I1[1, 2:end]; atol = 1e-13))
+
+    # Test against finer brute-force integral
+    f = x -> SymBoltz.sphericalbesselj(0, x/10.0)
+
+    xs = range(jlint.x[begin], jlint.x[end], length = 1000)
+    ys = f.(xs) .* SymBoltz.sphericalbesselj.(transpose(ls), xs)
+    IA = SymBoltz.NumericalIntegration.integrate(xs, ys, SymBoltz.SimpsonEven())
+
+    xs = range(jlint.x[begin], jlint.x[end], length = 100)
+    ys = f.(xs)
+    IB = SymBoltz.integrate(jlint, xs, ys)
+    @test all(isapprox.(IA, IB; atol = 1e-3))
+end

From 61c4cbbc84f30c3a7a5cea17426c410097a94ae9 Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Sat, 25 Apr 2026 23:23:16 +0200
Subject: [PATCH 3/8] Add dispatch for LOS integration with Filon integral
 cache

---
 src/observables/angular.jl | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/observables/angular.jl b/src/observables/angular.jl
index 969fce38..8a138c26 100644
--- a/src/observables/angular.jl
+++ b/src/observables/angular.jl
@@ -171,6 +171,28 @@ function los_integrate(sol::CosmologySolution, ls::AbstractVector, τs::Abstract
     return los_integrate(Ss, ls, τs, ks, jl; kwargs...)
 end
 
+function los_integrate(Ss::AbstractArray{T, 3}, ls::AbstractVector, τs::AbstractVector, ks::AbstractVector, jlint::SphericalBesselIntegralCache; kw...) where {T <: Real}
+    @assert size(Ss, 1) == 1
+    Ss = @view Ss[1, :, :]
+    τ0 = τs[end]
+    reverse!(τs) # reverse, i.e. integrate back in time (jlint set up to handle increasing x)
+    reverse!(Ss, dims=1)
+    Is = similar(Ss, length(ls), length(ks))
+    xs = similar(τs)
+    for ik in eachindex(ks)
+        k = ks[ik]
+        xs .= k .* (τ0 .- τs)
+        ys = @view Ss[:, ik]
+        is = @view Is[:, ik]
+        SymBoltz.integrate(is, jlint, xs, ys)
+        is ./= k # handle change of var!
+    end
+    Is = transpose(Is)
+    reverse!(τs) # undo
+    reverse!(Ss, dims=1) # undo
+    return reshape(Is, (1, size(Is, 1), size(Is, 2)))
+end
+
 # TODO: integrate splines instead of trapz! https://discourse.julialang.org/t/how-to-speed-up-the-numerical-integration-with-interpolation/96223/5
 @doc raw"""
     spectrum_cmb(ΘlAs::AbstractMatrix, ΘlBs::AbstractMatrix, P0s::AbstractVector, ls::AbstractVector, ks::AbstractVector; integrator = TrapezoidalRule(), normalization = :Cl, thread = true)
@@ -209,7 +231,7 @@ function spectrum_cmb(ΘlAs::AbstractMatrix, ΘlBs::AbstractMatrix, P0s::Abstrac
 end
 
 """
-    spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::SphericalBesselCache; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob),, reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
+    spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::Union{SphericalBesselCache, SphericalBesselIntegralCache}; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob),, reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
 
 Compute angular CMB power spectra ``Cₗᴬᴮ`` at angular wavenumbers `ls` from the cosmological problem `prob`.
 The requested `modes` are specified as a vector of symbols in the form `:AB`, where `A` and `B` are `T` (temperature), `E` (E-mode polarization) or `ψ` (lensing).
@@ -235,7 +257,7 @@ modes = [:TT, :TE, :ψψ, :ψT]
 Dls = spectrum_cmb(modes, prob, jl; normalization = :Dl, unit = u"μK")
 ```
 """
-function spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::SphericalBesselCache; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob), reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
+function spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::Union{SphericalBesselCache, SphericalBesselIntegralCache}; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob), reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
     ls = jl.l
     sol = solve(prob; bgopts, verbose)
     τ0 = getsym(sol, prob.M.τ0)(sol)

From 18bd7bcb4a25acae36edffa8a51ad48ec82948f3 Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Mon, 27 Apr 2026 10:11:39 +0200
Subject: [PATCH 4/8] Only store cumulative integrals and subtract to get
 difference on each interval

---
 src/filon.jl | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/filon.jl b/src/filon.jl
index 4b99a20f..b9b7d77d 100644
--- a/src/filon.jl
+++ b/src/filon.jl
@@ -5,8 +5,6 @@ struct SphericalBesselIntegralCache{T <: AbstractFloat}
     x::Vector{T} # uniform grid
     I0::Matrix{T} # ∫dx jₗ(x) x⁰ cumulatively to this x-point
     I1::Matrix{T} # ∫dx jₗ(x) x¹ cumulatively to this x-point
-    dI0::Matrix{T} # ∫dx jₗ(x) x⁰ from this to next x-point (0 for last point)
-    dI1::Matrix{T} # ∫dx jₗ(x) x¹ from this to next x-point (0 for last point)
     invdx::T # 1/dx (constant because grid is uniform)
     tmp1::Vector{T} # temporary workspace per l
     tmp2::Vector{T} # temporary workspace per l
@@ -26,8 +24,6 @@ function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_int
     T = typeof(x1)
     I0s = zeros(T, length(ls), length(xs)) # TODO: add dummy for final+1 lookup?
     I1s = zeros(T, length(ls), length(xs))
-    dI0s = zeros(T, length(ls), length(xs))
-    dI1s = zeros(T, length(ls), length(xs))
 
     Nsubx = 1 + Int(ceil((xs[2] - xs[1]) / mindx_integral))
     subis = zeros(T, Nsubx)
@@ -37,11 +33,9 @@ function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_int
         for ix in 1:length(xs)-1
             subxs = range(xs[ix], xs[ix+1]; length = Nsubx) # fine integration grid
             subis .= sphericalbesselj.(l, subxs) # jₗ(x) x⁰
-            dI0s[il, ix] = NumericalIntegration.integrate(subxs, subis, method) # ∫ jₗ(x) x⁰ dx
+            I0s[il, ix+1] = I0s[il, ix] + NumericalIntegration.integrate(subxs, subis, method) # cumulative ∫ jₗ(x) x⁰ dx
             subis .*= subxs # jₗ(x) x¹
-            dI1s[il, ix] = NumericalIntegration.integrate(subxs, subis, method) # ∫ jₗ(x) x¹ dx
-            I0s[il, ix+1] = I0s[il, ix] + dI0s[il, ix] # cumulative
-            I1s[il, ix+1] = I1s[il, ix] + dI1s[il, ix] # cumulative
+            I1s[il, ix+1] = I1s[il, ix] + NumericalIntegration.integrate(subxs, subis, method) # cumulative ∫ jₗ(x) x¹ dx
         end
     end
 
@@ -49,7 +43,7 @@ function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_int
     tmp2 = zeros(T, length(ls))
     tmp3 = zeros(T, length(ls))
     tmp4 = zeros(T, length(ls))
-    return SphericalBesselIntegralCache{T}(ls, xs, I0s, I1s, dI0s, dI1s, 1.0/dx, tmp1, tmp2, tmp3, tmp4)
+    return SphericalBesselIntegralCache{T}(ls, xs, I0s, I1s, 1.0/dx, tmp1, tmp2, tmp3, tmp4)
 end
 
 function Base.show(io::IO, jlint::SphericalBesselIntegralCache)
@@ -62,18 +56,17 @@ end
 # TODO: Hermite interpolation, have derivative for free: could use both (I0, w) and (I1, w*x)? https://en.wikipedia.org/wiki/Cubic_Hermite_spline
 Base.@propagate_inbounds function (jlint::SphericalBesselIntegralCache)(out0::AbstractArray{<:AbstractFloat}, out1::AbstractArray{<:AbstractFloat}, x)
     x1 = jlint.x[begin]
-    ix = 1 + unsafe_trunc(Int, (x-x1)*jlint.invdx) # O(1) uniform grid lookup (corresponding to left point) # TODO: floor(Int, instead?
-    x₋ = jlint.x[ix]
+    ix₋ = 1 + unsafe_trunc(Int, (x-x1)*jlint.invdx) # O(1) uniform grid lookup (corresponding to left point) # TODO: floor(Int, instead?
+    ix₊ = min(ix₋ + 1, length(jlint.x))
+    x₋ = jlint.x[ix₋]
     w = (x - x₋) * jlint.invdx
-    ibase = (ix - 1) * length(jlint.l)
     @inbounds @fastmath #=@simd=# for il in eachindex(out0)
-        i = ibase + il
-        I0 = jlint.I0[i]
-        I1 = jlint.I1[i]
-        dI0 = jlint.dI0[i] # TODO: don't look up; keep stored for prev/next point in main integrate loop
-        dI1 = jlint.dI1[i]
-        out0[il] = muladd(w, dI0, I0) # TODO: avoid doing this so many times; make function that computes I(b)-I(a) instead
-        out1[il] = muladd(w, dI1, I1)
+        I0₋ = jlint.I0[il, ix₋]
+        I1₋ = jlint.I1[il, ix₋]
+        I0₊ = jlint.I0[il, ix₊]
+        I1₊ = jlint.I1[il, ix₊]
+        out0[il] = muladd(w, I0₊ - I0₋, I0₋)
+        out1[il] = muladd(w, I1₊ - I1₋, I1₋)
     end
     return nothing
 end

From f6d18a0a75651ba90e03c67628e12747f03e784f Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Mon, 27 Apr 2026 11:55:41 +0200
Subject: [PATCH 5/8] Use Hermite interpolation for Filon integrals

---
 src/filon.jl | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/src/filon.jl b/src/filon.jl
index b9b7d77d..458500a7 100644
--- a/src/filon.jl
+++ b/src/filon.jl
@@ -3,6 +3,7 @@ using Bessels
 struct SphericalBesselIntegralCache{T <: AbstractFloat}
     l::Vector{Int}
     x::Vector{T} # uniform grid
+    w::Matrix{T} # jₗ(x) at this x-point
     I0::Matrix{T} # ∫dx jₗ(x) x⁰ cumulatively to this x-point
     I1::Matrix{T} # ∫dx jₗ(x) x¹ cumulatively to this x-point
     invdx::T # 1/dx (constant because grid is uniform)
@@ -13,7 +14,7 @@ struct SphericalBesselIntegralCache{T <: AbstractFloat}
 end
 
 # TODO: adaptively grid size from tolerance parameter? can compare to sin
-function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_integral = 2π/128, method = TrapezoidalEven())
+function SphericalBesselIntegralCache(ls, x1 = 0.0, x2 = 10*ls[end]; mindx_grid = 2π/26, mindx_integral = 2π/2048, method = SimpsonEven())
     Nx = 1 + Int(ceil((x2 - x1) / mindx_grid))
     xs = range(x1, x2, length=Nx)
     dx = step(xs)
@@ -24,6 +25,7 @@ function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_int
     T = typeof(x1)
     I0s = zeros(T, length(ls), length(xs)) # TODO: add dummy for final+1 lookup?
     I1s = zeros(T, length(ls), length(xs))
+    ws = zeros(T, length(ls), length(xs))
 
     Nsubx = 1 + Int(ceil((xs[2] - xs[1]) / mindx_integral))
     subis = zeros(T, Nsubx)
@@ -37,13 +39,14 @@ function SphericalBesselIntegralCache(ls, x1, x2; mindx_grid = 2π/32, mindx_int
             subis .*= subxs # jₗ(x) x¹
             I1s[il, ix+1] = I1s[il, ix] + NumericalIntegration.integrate(subxs, subis, method) # cumulative ∫ jₗ(x) x¹ dx
         end
+        ws[il, :] = sphericalbesselj.(l, xs)
     end
 
     tmp1 = zeros(T, length(ls))
     tmp2 = zeros(T, length(ls))
     tmp3 = zeros(T, length(ls))
     tmp4 = zeros(T, length(ls))
-    return SphericalBesselIntegralCache{T}(ls, xs, I0s, I1s, 1.0/dx, tmp1, tmp2, tmp3, tmp4)
+    return SphericalBesselIntegralCache{T}(ls, xs, ws, I0s, I1s, 1.0/dx, tmp1, tmp2, tmp3, tmp4)
 end
 
 function Base.show(io::IO, jlint::SphericalBesselIntegralCache)
@@ -54,24 +57,36 @@ function Base.show(io::IO, jlint::SphericalBesselIntegralCache)
 end
 
 # TODO: Hermite interpolation, have derivative for free: could use both (I0, w) and (I1, w*x)? https://en.wikipedia.org/wiki/Cubic_Hermite_spline
-Base.@propagate_inbounds function (jlint::SphericalBesselIntegralCache)(out0::AbstractArray{<:AbstractFloat}, out1::AbstractArray{<:AbstractFloat}, x)
+Base.@propagate_inbounds @fastmath function (jlint::SphericalBesselIntegralCache)(out0::AbstractArray{<:AbstractFloat}, out1::AbstractArray{<:AbstractFloat}, x)
     x1 = jlint.x[begin]
-    ix₋ = 1 + unsafe_trunc(Int, (x-x1)*jlint.invdx) # O(1) uniform grid lookup (corresponding to left point) # TODO: floor(Int, instead?
+    ix₋ = 1 + unsafe_trunc(Int, (x - x1) * jlint.invdx) # O(1) uniform grid lookup (corresponding to left point) # TODO: floor(Int, instead?
     ix₊ = min(ix₋ + 1, length(jlint.x))
     x₋ = jlint.x[ix₋]
-    w = (x - x₋) * jlint.invdx
-    @inbounds @fastmath #=@simd=# for il in eachindex(out0)
+    x₊ = jlint.x[ix₊]
+    dx = x₊ - x₋
+    t = (x - x₋) * jlint.invdx
+    h00 = (1 + 2t) * (1 - t)^2 # https://en.wikipedia.org/wiki/Cubic_Hermite_spline
+    h01 = t^2 * (3 - 2t)
+    h10 = t * (1 - t)^2 * dx # derivatives scaled by dx to map from x to t ∈ [0, 1]
+    h11 = t^2 * (t - 1) * dx # derivatives scaled by dx to map from x to t ∈ [0, 1]
+    @inbounds #=@simd=# for il in eachindex(out0)
+        w₋ = jlint.w[il, ix₋]
         I0₋ = jlint.I0[il, ix₋]
         I1₋ = jlint.I1[il, ix₋]
+        I0₋′ = w₋
+        I1₋′ = w₋ * x₋
+        w₊ = jlint.w[il, ix₊]
         I0₊ = jlint.I0[il, ix₊]
         I1₊ = jlint.I1[il, ix₊]
-        out0[il] = muladd(w, I0₊ - I0₋, I0₋)
-        out1[il] = muladd(w, I1₊ - I1₋, I1₋)
+        I0₊′ = w₊
+        I1₊′ = w₊ * x₊
+        out0[il] = h00 * I0₋ + h10 * I0₋′ + h01 * I0₊ + h11 * I0₊′
+        out1[il] = h00 * I1₋ + h10 * I1₋′ + h01 * I1₊ + h11 * I1₊′
     end
     return nothing
 end
 
-function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray, ys::AbstractArray)
+@fastmath function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray, ys::AbstractArray)
     xs[2] > xs[1] || error("x-domain is not strictly increasing")
     xs[begin] ≥ jlint.x[begin] || error("$(xs[begin]) is outside integral cache left bound $(jlint.x[begin])")
     xs[end] ≤ jlint.x[end] || error("$(xs[end]) is outside integral cache right bound $(jlint.x[end])")
@@ -81,7 +96,8 @@ function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray,
     x₋ = xs[i₋]
     y₋ = ys[i₋]
     jlint(I0₋, I1₋, x₋)
-    @inbounds @fastmath for i₊ in 2:length(xs) # TODO: write out and unroll loop to look up jlint(x) once per x and l without setindex?
+    @inbounds while i₋ < length(xs) # TODO: write out and unroll loop to look up jlint(x) once per x and l without setindex?
+        i₊ = i₋ + 1
         x₊ = xs[i₊]
         y₊ = ys[i₊]
         A = (y₊ - y₋) / (x₊ - x₋) # A in y = Ax + B
@@ -91,7 +107,7 @@ function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray,
         #=@simd=# for il in eachindex(out)
             I0 = I0₊[il] - I0₋[il] # ∫dx jₗ(x) x⁰ from x₋ to x₊
             I1 = I1₊[il] - I1₋[il] # ∫dx jₗ(x) x¹ from x₋ to x₊ # TODO: interpolate differences instead?
-            out[il] = muladd(A, I1, muladd(B, I0, out[il])) # i.e. out[il] += A*I1 + B*I0
+            out[il] += A*I1 + B*I0
         end
         I0₋, I1₋, I0₊, I1₊ = I0₊, I1₊, I0₋, I1₋ # pointer-like swap for next iteration
         i₋, x₋, y₋ = i₊, x₊, y₊ # pass to next iteration

From 4383217657c8a9867928f211d0733e6230ebeb13 Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Mon, 27 Apr 2026 12:07:26 +0200
Subject: [PATCH 6/8] Update Filon integration tests

---
 test/runtests.jl | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 5a47a155..e3c5cc6a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -719,10 +719,10 @@ end
 end
 
 @testset "Filon integration" begin
-    ls = 0:100
-    jlint = SymBoltz.SphericalBesselIntegralCache(ls, 0.0, 100.0; mindx_grid = 2π/128, mindx_integral = 2π/1024, method = SymBoltz.SimpsonEven())
+    ls = 0:10:1000
+    jlint = SymBoltz.SphericalBesselIntegralCache(ls, 0.0, 1000.0; mindx_grid = 2π/26, mindx_integral = 2π/2048, method = SymBoltz.SimpsonEven())
 
-    # Test analytical result exactly at grid points
+    # Test analytical result exactly at grid points (depends on mindx_integral)
     I1s = jlint.I1[1, :]
     I1s_anal = 1 .- cos.(jlint.x)
     @test all(isapprox.(I1s, I1s_anal; atol = 1e-10))
@@ -731,23 +731,23 @@ end
     xmid = (jlint.x[begin:end-1] .+ jlint.x[begin+1:end]) ./ 2
     I1s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [0.0, x])[begin], xmid)
     I1s_anal = 1 .- cos.(xmid)
-    @test all(isapprox.(I1s, I1s_anal; atol = 1e-3))
+    @test all(isapprox.(I1s, I1s_anal; atol = 1e-5))
 
-    # Test that integrals interpolate almost exactly at grid points
+    # Test that integrals interpolate exactly at grid points
     I0s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [1.0, 1.0])[begin], jlint.x[2:end])
     I1s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [0.0, x])[begin], jlint.x[2:end])
-    @test all(isapprox.(I0s, jlint.I0[1, 2:end]; atol = 1e-15))
-    @test all(isapprox.(I1s, jlint.I1[1, 2:end]; atol = 1e-13))
+    @test all(isequal.(I0s, jlint.I0[1, 2:end]))
+    @test all(isequal.(I1s, jlint.I1[1, 2:end]))
 
     # Test against finer brute-force integral
     f = x -> SymBoltz.sphericalbesselj(0, x/10.0)
 
-    xs = range(jlint.x[begin], jlint.x[end], length = 1000)
+    xs = range(jlint.x[begin], jlint.x[end], step = 2π/1024)
     ys = f.(xs) .* SymBoltz.sphericalbesselj.(transpose(ls), xs)
-    IA = SymBoltz.NumericalIntegration.integrate(xs, ys, SymBoltz.SimpsonEven())
+    IA = SymBoltz.NumericalIntegration.integrate(xs, ys, SymBoltz.TrapezoidalEven())
 
-    xs = range(jlint.x[begin], jlint.x[end], length = 100)
+    xs = range(jlint.x[begin], jlint.x[end], step = 2π/128)
     ys = f.(xs)
     IB = SymBoltz.integrate(jlint, xs, ys)
-    @test all(isapprox.(IA, IB; atol = 1e-3))
+    @test all(isapprox.(IA, IB; atol = 1e-6))
 end

From bd9e5efdc3aee548f1a21ceb855980a7063622c1 Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Mon, 27 Apr 2026 13:52:53 +0200
Subject: [PATCH 7/8] Make Filon integration multithreaded

---
 src/filon.jl               |  5 +++--
 src/observables/angular.jl | 17 ++++++++++++-----
 test/runtests.jl           | 10 ++++++++++
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/filon.jl b/src/filon.jl
index 458500a7..e5bb9b98 100644
--- a/src/filon.jl
+++ b/src/filon.jl
@@ -86,12 +86,13 @@ Base.@propagate_inbounds @fastmath function (jlint::SphericalBesselIntegralCache
     return nothing
 end
 
-@fastmath function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray, ys::AbstractArray)
+@fastmath function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray, ys::AbstractArray, I0₋ = jlint.tmp1, I0₊ = jlint.tmp2, I1₋ = jlint.tmp3, I1₊ = jlint.tmp4; thread = false)
     xs[2] > xs[1] || error("x-domain is not strictly increasing")
     xs[begin] ≥ jlint.x[begin] || error("$(xs[begin]) is outside integral cache left bound $(jlint.x[begin])")
     xs[end] ≤ jlint.x[end] || error("$(xs[end]) is outside integral cache right bound $(jlint.x[end])")
+    size(I0₋) == size(I0₊) == size(I1₋) == size(I1₊) == size(jlint.l) || error("Workspaces have wrong size")
+    thread && I0₋ === jlint.tmp1 && I0₊ === jlint.tmp2 && I1₋ === jlint.tmp3 && I1₊ === jlint.tmp4 && error("Multithreading requires task-local workspaces")
     out .= 0.0
-    I0₋, I0₊, I1₋, I1₊ = jlint.tmp1, jlint.tmp2, jlint.tmp3, jlint.tmp4 # TODO: thread-unsafe
     i₋ = 1
     x₋ = xs[i₋]
     y₋ = ys[i₋]
diff --git a/src/observables/angular.jl b/src/observables/angular.jl
index 8a138c26..42202a37 100644
--- a/src/observables/angular.jl
+++ b/src/observables/angular.jl
@@ -171,20 +171,27 @@ function los_integrate(sol::CosmologySolution, ls::AbstractVector, τs::Abstract
     return los_integrate(Ss, ls, τs, ks, jl; kwargs...)
 end
 
-function los_integrate(Ss::AbstractArray{T, 3}, ls::AbstractVector, τs::AbstractVector, ks::AbstractVector, jlint::SphericalBesselIntegralCache; kw...) where {T <: Real}
+function los_integrate(Ss::AbstractArray{T, 3}, ls::AbstractVector, τs::AbstractVector, ks::AbstractVector, jlint::SphericalBesselIntegralCache; thread = true, kw...) where {T <: Real}
     @assert size(Ss, 1) == 1
     Ss = @view Ss[1, :, :]
     τ0 = τs[end]
     reverse!(τs) # reverse, i.e. integrate back in time (jlint set up to handle increasing x)
     reverse!(Ss, dims=1)
-    Is = similar(Ss, length(ls), length(ks))
-    xs = similar(τs)
-    for ik in eachindex(ks)
+    Is = zeros(eltype(Ss), length(ls), length(ks))
+    @localize Is @tasks for ik in eachindex(ks) # parallellize independent loop iterations
+        @set scheduler = thread ? :dynamic : :serial
+        @local begin # define task-local values (declared once for all loop iterations)
+            xs = similar(τs)
+            tmp1 = similar(Ss, length(ls))
+            tmp2 = similar(Ss, length(ls))
+            tmp3 = similar(Ss, length(ls))
+            tmp4 = similar(Ss, length(ls))
+        end
         k = ks[ik]
         xs .= k .* (τ0 .- τs)
         ys = @view Ss[:, ik]
         is = @view Is[:, ik]
-        SymBoltz.integrate(is, jlint, xs, ys)
+        SymBoltz.integrate(is, jlint, xs, ys, tmp1, tmp2, tmp3, tmp4)
         is ./= k # handle change of var!
     end
     Is = transpose(Is)
diff --git a/test/runtests.jl b/test/runtests.jl
index e3c5cc6a..bece9de0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -750,4 +750,14 @@ end
     ys = f.(xs)
     IB = SymBoltz.integrate(jlint, xs, ys)
     @test all(isapprox.(IA, IB; atol = 1e-6))
+
+    # Multithreading / workspaces
+    tmp1 = zeros(length(jlint.l))
+    tmp2 = zeros(length(jlint.l))
+    tmp3 = zeros(length(jlint.l))
+    tmp4 = zeros(length(jlint.l))
+    @test SymBoltz.integrate(jlint, [0.0, π], [0.0, π], tmp1, tmp2, tmp3, tmp4; thread = true)[begin] ≈ 2.0
+    tmp3 = zeros(length(jlint.l)+1)
+    @test_throws "wrong size" SymBoltz.integrate(jlint, [0.0, π], [0.0, π], tmp1, tmp2, tmp3, tmp4)
+    @test_throws "requires task-local" SymBoltz.integrate(jlint, [0.0, 1.0], [0.0, 1.0]; thread = true)
 end

From 9e1c5f5f2dc1f063cbb7f78d031daef1652adf2e Mon Sep 17 00:00:00 2001
From: Herman Sletmoen <hermansletmoen@gmail.com>
Date: Mon, 27 Apr 2026 13:58:53 +0200
Subject: [PATCH 8/8] Compute CMB power spectrum with Filon integration in
 CLASS comparison

---
 docs/src/comparison.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/src/comparison.md b/docs/src/comparison.md
index 499388de..ebeefba6 100644
--- a/docs/src/comparison.md
+++ b/docs/src/comparison.md
@@ -413,7 +413,8 @@ end
 l = 20:20:2000 # CLASS default is lmax = 2500
 Dl1 = Dl_class([:TT, :TE, :EE, :phiphi, :TPhi, :Ephi], l, pars)
 jl = SphericalBesselCache(l)
-Dl2 = Dl_symboltz([:TT, :TE, :EE, :ψψ, :ψT, :ψE], jl, pars)
+jlint = SphericalBesselIntegralCache(l)
+Dl2 = Dl_symboltz([:TT, :TE, :EE, :ψψ, :ψT, :ψE], jlint, pars)
 plot_compare(l, l, Dl1[:, 1], Dl2[:, 1], "l", "Dₗ(TT)"; tol = 2e-12)
 ```
 ```@example class