hersle · hersle · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 27, 2026
diff --git a/docs/src/comparison.md b/docs/src/comparison.md
@@ -413,7 +413,8 @@ end
 l = 20:20:2000 # CLASS default is lmax = 2500
 Dl1 = Dl_class([:TT, :TE, :EE, :phiphi, :TPhi, :Ephi], l, pars)
 jl = SphericalBesselCache(l)
-Dl2 = Dl_symboltz([:TT, :TE, :EE, :ψψ, :ψT, :ψE], jl, pars)
+jlint = SphericalBesselIntegralCache(l)
+Dl2 = Dl_symboltz([:TT, :TE, :EE, :ψψ, :ψT, :ψE], jlint, pars)
 plot_compare(l, l, Dl1[:, 1], Dl2[:, 1], "l", "Dₗ(TT)"; tol = 2e-12)
 ```
 ```@example class

diff --git a/src/SymBoltz.jl b/src/SymBoltz.jl
@@ -11,6 +11,7 @@ using OhMyThreads
 using Base.Threads
 using Setfield
 using StaticArrays
+using NumericalIntegration
 
 # TODO: generate gravity equations
 # TODO: modified gravity: coupled quintessence; DGP, parametrized framework, EFT of LSS, ...
@@ -40,6 +41,7 @@ include("models/curvature.jl")
 include("models/inflation.jl")
 include("models/cosmologies.jl")
 include("solve.jl")
+include("filon.jl")
 include("observables/distances.jl")
 include("observables/fourier.jl")
 include("observables/angular.jl")
@@ -51,7 +53,7 @@ export CosmologyProblem, CosmologySolution
 export background, perturbations, expandeq
 export solve, solvebg, solvept, remake, issuccess, parameter_updater
 export parameters_Planck18
-export spectrum_primordial, spectrum_matter, spectrum_matter_nonlinear, spectrum_cmb, correlation_function, variance_matter, stddev_matter, los_integrate, source_grid, source_grid_adaptive, sound_horizon, distance_luminosity, SphericalBesselCache
+export spectrum_primordial, spectrum_matter, spectrum_matter_nonlinear, spectrum_cmb, correlation_function, variance_matter, stddev_matter, los_integrate, source_grid, source_grid_adaptive, sound_horizon, distance_luminosity, SphericalBesselCache, SphericalBesselIntegralCache
 export express_derivatives
 
 using PrecompileTools: @compile_workload

diff --git a/src/filon.jl b/src/filon.jl
@@ -0,0 +1,137 @@
+using Bessels
+
+struct SphericalBesselIntegralCache{T <: AbstractFloat}
+    l::Vector{Int}
+    x::Vector{T} # uniform grid
+    w::Matrix{T} # jₗ(x) at this x-point
+    I0::Matrix{T} # ∫dx jₗ(x) x⁰ cumulatively to this x-point
+    I1::Matrix{T} # ∫dx jₗ(x) x¹ cumulatively to this x-point
+    invdx::T # 1/dx (constant because grid is uniform)
+    tmp1::Vector{T} # temporary workspace per l
+    tmp2::Vector{T} # temporary workspace per l
+    tmp3::Vector{T} # temporary workspace per l
+    tmp4::Vector{T} # temporary workspace per l
+end
+
+# TODO: adaptively grid size from tolerance parameter? can compare to sin
+function SphericalBesselIntegralCache(ls, x1 = 0.0, x2 = 10*ls[end]; mindx_grid = 2π/26, mindx_integral = 2π/2048, method = SimpsonEven())
+    Nx = 1 + Int(ceil((x2 - x1) / mindx_grid))
+    xs = range(x1, x2, length=Nx)
+    dx = step(xs)
+    xs = collect(xs)
+
+    mindx_integral ≤ dx ≤ mindx_grid || error("Integration grid must be finer than interpolation grid")
+
+    T = typeof(x1)
+    I0s = zeros(T, length(ls), length(xs)) # TODO: add dummy for final+1 lookup?
+    I1s = zeros(T, length(ls), length(xs))
+    ws = zeros(T, length(ls), length(xs))
+
+    Nsubx = 1 + Int(ceil((xs[2] - xs[1]) / mindx_integral))
+    subis = zeros(T, Nsubx)
+
+    @inbounds for il in eachindex(ls)
+        l = ls[il]
+        for ix in 1:length(xs)-1
+            subxs = range(xs[ix], xs[ix+1]; length = Nsubx) # fine integration grid
+            subis .= sphericalbesselj.(l, subxs) # jₗ(x) x⁰
+            I0s[il, ix+1] = I0s[il, ix] + NumericalIntegration.integrate(subxs, subis, method) # cumulative ∫ jₗ(x) x⁰ dx
+            subis .*= subxs # jₗ(x) x¹
+            I1s[il, ix+1] = I1s[il, ix] + NumericalIntegration.integrate(subxs, subis, method) # cumulative ∫ jₗ(x) x¹ dx
+        end
+        ws[il, :] = sphericalbesselj.(l, xs)
+    end
+
+    tmp1 = zeros(T, length(ls))
+    tmp2 = zeros(T, length(ls))
+    tmp3 = zeros(T, length(ls))
+    tmp4 = zeros(T, length(ls))
+    return SphericalBesselIntegralCache{T}(ls, xs, ws, I0s, I1s, 1.0/dx, tmp1, tmp2, tmp3, tmp4)
+end
+
+function Base.show(io::IO, jlint::SphericalBesselIntegralCache)
+    print(io, "jₗ(x) integration cache ")
+    print(io, "for $(jlint.l[begin]) ≤ l ≤ $(jlint.l[end]) and ")
+    print(io, "$(jlint.x[begin]) ≤ x ≤ $(jlint.x[end]) ")
+    print(io, "($(Base.format_bytes(Base.summarysize(jlint))))\n")
+end
+
+# TODO: Hermite interpolation, have derivative for free: could use both (I0, w) and (I1, w*x)? https://en.wikipedia.org/wiki/Cubic_Hermite_spline
+Base.@propagate_inbounds @fastmath function (jlint::SphericalBesselIntegralCache)(out0::AbstractArray{<:AbstractFloat}, out1::AbstractArray{<:AbstractFloat}, x)
+    x1 = jlint.x[begin]
+    ix₋ = 1 + unsafe_trunc(Int, (x - x1) * jlint.invdx) # O(1) uniform grid lookup (corresponding to left point) # TODO: floor(Int, instead?
+    ix₊ = min(ix₋ + 1, length(jlint.x))
+    x₋ = jlint.x[ix₋]
+    x₊ = jlint.x[ix₊]
+    dx = x₊ - x₋
+    t = (x - x₋) * jlint.invdx
+    h00 = (1 + 2t) * (1 - t)^2 # https://en.wikipedia.org/wiki/Cubic_Hermite_spline
+    h01 = t^2 * (3 - 2t)
+    h10 = t * (1 - t)^2 * dx # derivatives scaled by dx to map from x to t ∈ [0, 1]
+    h11 = t^2 * (t - 1) * dx # derivatives scaled by dx to map from x to t ∈ [0, 1]
+    @inbounds #=@simd=# for il in eachindex(out0)
+        w₋ = jlint.w[il, ix₋]
+        I0₋ = jlint.I0[il, ix₋]
+        I1₋ = jlint.I1[il, ix₋]
+        I0₋′ = w₋
+        I1₋′ = w₋ * x₋
+        w₊ = jlint.w[il, ix₊]
+        I0₊ = jlint.I0[il, ix₊]
+        I1₊ = jlint.I1[il, ix₊]
+        I0₊′ = w₊
+        I1₊′ = w₊ * x₊
+        out0[il] = h00 * I0₋ + h10 * I0₋′ + h01 * I0₊ + h11 * I0₊′
+        out1[il] = h00 * I1₋ + h10 * I1₋′ + h01 * I1₊ + h11 * I1₊′
+    end
+    return nothing
+end
+
+@fastmath function integrate(out, jlint::SphericalBesselIntegralCache, xs::AbstractArray, ys::AbstractArray, I0₋ = jlint.tmp1, I0₊ = jlint.tmp2, I1₋ = jlint.tmp3, I1₊ = jlint.tmp4; thread = false)
+    xs[2] > xs[1] || error("x-domain is not strictly increasing")
+    xs[begin] ≥ jlint.x[begin] || error("$(xs[begin]) is outside integral cache left bound $(jlint.x[begin])")
+    xs[end] ≤ jlint.x[end] || error("$(xs[end]) is outside integral cache right bound $(jlint.x[end])")
+    size(I0₋) == size(I0₊) == size(I1₋) == size(I1₊) == size(jlint.l) || error("Workspaces have wrong size")
+    thread && I0₋ === jlint.tmp1 && I0₊ === jlint.tmp2 && I1₋ === jlint.tmp3 && I1₊ === jlint.tmp4 && error("Multithreading requires task-local workspaces")
+    out .= 0.0
+    i₋ = 1
+    x₋ = xs[i₋]
+    y₋ = ys[i₋]
+    jlint(I0₋, I1₋, x₋)
+    @inbounds while i₋ < length(xs) # TODO: write out and unroll loop to look up jlint(x) once per x and l without setindex?
+        i₊ = i₋ + 1
+        x₊ = xs[i₊]
+        y₊ = ys[i₊]
+        A = (y₊ - y₋) / (x₊ - x₋) # A in y = Ax + B
+        B = y₋ - A*x₋ # B in y = Ax + B
+        # TODO: quadratic; C?
+        jlint(I0₊, I1₊, x₊) # TODO: write out this to avoid l-indexing?
+        #=@simd=# for il in eachindex(out)
+            I0 = I0₊[il] - I0₋[il] # ∫dx jₗ(x) x⁰ from x₋ to x₊
+            I1 = I1₊[il] - I1₋[il] # ∫dx jₗ(x) x¹ from x₋ to x₊ # TODO: interpolate differences instead?
+            out[il] += A*I1 + B*I0
+        end
+        I0₋, I1₋, I0₊, I1₊ = I0₊, I1₊, I0₋, I1₋ # pointer-like swap for next iteration
+        i₋, x₋, y₋ = i₊, x₊, y₊ # pass to next iteration
+    end
+    return out
+end
+
+# Convenience dispatches
+function integrate(out, jlint::SphericalBesselIntegralCache, f::Function, x1, x2; kw...)
+    xs = range(x1, x2; kw...)
+    ys = f.(xs)
+    return integrate(out, jlint, xs, ys)
+end
+function integrate(out, jlint::SphericalBesselIntegralCache, f::Function, x2; kw...)
+    x1 = jlint.x[begin]
+    return integrate(out, jlint, f, x1, x2; kw...)
+end
+function integrate(out, jlint::SphericalBesselIntegralCache, f::Function; kw...)
+    x2 = jlint.x[end]
+    return integrate(out, jlint, f, x2; kw...)
+end
+function integrate(jlint::SphericalBesselIntegralCache, args...; kw...)
+    out = zeros(eltype(jlint.I0), length(jlint.l))
+    integrate(out, jlint, args...; kw...)
+    return out
+end
diff --git a/src/observables/angular.jl b/src/observables/angular.jl
@@ -171,6 +171,35 @@ function los_integrate(sol::CosmologySolution, ls::AbstractVector, τs::Abstract
     return los_integrate(Ss, ls, τs, ks, jl; kwargs...)
 end
 
+function los_integrate(Ss::AbstractArray{T, 3}, ls::AbstractVector, τs::AbstractVector, ks::AbstractVector, jlint::SphericalBesselIntegralCache; thread = true, kw...) where {T <: Real}
+    @assert size(Ss, 1) == 1
+    Ss = @view Ss[1, :, :]
+    τ0 = τs[end]
+    reverse!(τs) # reverse, i.e. integrate back in time (jlint set up to handle increasing x)
+    reverse!(Ss, dims=1)
+    Is = zeros(eltype(Ss), length(ls), length(ks))
+    @localize Is @tasks for ik in eachindex(ks) # parallellize independent loop iterations
+        @set scheduler = thread ? :dynamic : :serial
+        @local begin # define task-local values (declared once for all loop iterations)
+            xs = similar(τs)
+            tmp1 = similar(Ss, length(ls))
+            tmp2 = similar(Ss, length(ls))
+            tmp3 = similar(Ss, length(ls))
+            tmp4 = similar(Ss, length(ls))
+        end
+        k = ks[ik]
+        xs .= k .* (τ0 .- τs)
+        ys = @view Ss[:, ik]
+        is = @view Is[:, ik]
+        SymBoltz.integrate(is, jlint, xs, ys, tmp1, tmp2, tmp3, tmp4)
+        is ./= k # handle change of var!
+    end
+    Is = transpose(Is)
+    reverse!(τs) # undo
+    reverse!(Ss, dims=1) # undo
+    return reshape(Is, (1, size(Is, 1), size(Is, 2)))
+end
+
 # TODO: integrate splines instead of trapz! https://discourse.julialang.org/t/how-to-speed-up-the-numerical-integration-with-interpolation/96223/5
 @doc raw"""
     spectrum_cmb(ΘlAs::AbstractMatrix, ΘlBs::AbstractMatrix, P0s::AbstractVector, ls::AbstractVector, ks::AbstractVector; integrator = TrapezoidalRule(), normalization = :Cl, thread = true)
@@ -209,7 +238,7 @@ function spectrum_cmb(ΘlAs::AbstractMatrix, ΘlBs::AbstractMatrix, P0s::Abstrac
 end
 
 """
-    spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::SphericalBesselCache; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob),, reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
+    spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::Union{SphericalBesselCache, SphericalBesselIntegralCache}; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob),, reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
 
 Compute angular CMB power spectra ``Cₗᴬᴮ`` at angular wavenumbers `ls` from the cosmological problem `prob`.
 The requested `modes` are specified as a vector of symbols in the form `:AB`, where `A` and `B` are `T` (temperature), `E` (E-mode polarization) or `ψ` (lensing).
@@ -235,7 +264,7 @@ modes = [:TT, :TE, :ψψ, :ψT]
 Dls = spectrum_cmb(modes, prob, jl; normalization = :Dl, unit = u"μK")
 ```
 """
-function spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::SphericalBesselCache; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob), reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
+function spectrum_cmb(modes::AbstractVector{<:Symbol}, prob::CosmologyProblem, jl::Union{SphericalBesselCache, SphericalBesselIntegralCache}; normalization = :Cl, unit = nothing, kτ0s = 0.1*jl.l[begin]:2π/2:10*jl.l[end], xs = 0.0:0.0008:1.0, l_limber = 50, integrator = TrapezoidalRule(), bgopts = (alg = bgalg(prob), reltol = 1e-7, abstol = 1e-7), ptopts = (alg = ptalg(prob), reltol = 1e-5, abstol = 1e-5), sourceopts = (rtol = 1e-3, atol = 0.9), coarse_length = 9, thread = true, verbose = false, kwargs...)
     ls = jl.l
     sol = solve(prob; bgopts, verbose)
     τ0 = getsym(sol, prob.M.τ0)(sol)

diff --git a/src/solve.jl b/src/solve.jl
@@ -9,7 +9,6 @@ import OhMyThreads: TaskLocalValue
 import SymbolicIndexingInterface
 import SymbolicIndexingInterface: getsym, setsym_oop, parameter_values
 using RecursiveFactorization # makes RFLUFactorization() available as linear solver: https://docs.sciml.ai/LinearSolve/stable/tutorials/accelerating_choices/
-import NumericalIntegration: cumul_integrate
 using SparseArrays
 import NonlinearSolve.BracketingNonlinearSolve: AbstractBracketingAlgorithm
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -717,3 +717,47 @@ end
     ]
     @test isequal(expandeq(eqs, D(ρ); protect = Set(ℋ)), -4ℋ*ρ)
 end
+
+@testset "Filon integration" begin
+    ls = 0:10:1000
+    jlint = SymBoltz.SphericalBesselIntegralCache(ls, 0.0, 1000.0; mindx_grid = 2π/26, mindx_integral = 2π/2048, method = SymBoltz.SimpsonEven())
+
+    # Test analytical result exactly at grid points (depends on mindx_integral)
+    I1s = jlint.I1[1, :]
+    I1s_anal = 1 .- cos.(jlint.x)
+    @test all(isapprox.(I1s, I1s_anal; atol = 1e-10))
+
+    # Test analytical result exactly between grid points (where interpolation error should be largest)
+    xmid = (jlint.x[begin:end-1] .+ jlint.x[begin+1:end]) ./ 2
+    I1s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [0.0, x])[begin], xmid)
+    I1s_anal = 1 .- cos.(xmid)
+    @test all(isapprox.(I1s, I1s_anal; atol = 1e-5))
+
+    # Test that integrals interpolate exactly at grid points
+    I0s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [1.0, 1.0])[begin], jlint.x[2:end])
+    I1s = map(x -> SymBoltz.integrate(jlint, [0.0, x], [0.0, x])[begin], jlint.x[2:end])
+    @test all(isequal.(I0s, jlint.I0[1, 2:end]))
+    @test all(isequal.(I1s, jlint.I1[1, 2:end]))
+
+    # Test against finer brute-force integral
+    f = x -> SymBoltz.sphericalbesselj(0, x/10.0)
+
+    xs = range(jlint.x[begin], jlint.x[end], step = 2π/1024)
+    ys = f.(xs) .* SymBoltz.sphericalbesselj.(transpose(ls), xs)
+    IA = SymBoltz.NumericalIntegration.integrate(xs, ys, SymBoltz.TrapezoidalEven())
+
+    xs = range(jlint.x[begin], jlint.x[end], step = 2π/128)
+    ys = f.(xs)
+    IB = SymBoltz.integrate(jlint, xs, ys)
+    @test all(isapprox.(IA, IB; atol = 1e-6))
+
+    # Multithreading / workspaces
+    tmp1 = zeros(length(jlint.l))
+    tmp2 = zeros(length(jlint.l))
+    tmp3 = zeros(length(jlint.l))
+    tmp4 = zeros(length(jlint.l))
+    @test SymBoltz.integrate(jlint, [0.0, π], [0.0, π], tmp1, tmp2, tmp3, tmp4; thread = true)[begin] ≈ 2.0
+    tmp3 = zeros(length(jlint.l)+1)
+    @test_throws "wrong size" SymBoltz.integrate(jlint, [0.0, π], [0.0, π], tmp1, tmp2, tmp3, tmp4)
+    @test_throws "requires task-local" SymBoltz.integrate(jlint, [0.0, 1.0], [0.0, 1.0]; thread = true)
+end