From 8c8762c7764ed8eb932f1433a93546e661aee0b3 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Sat, 30 May 2026 19:40:08 -0400 Subject: [PATCH 1/3] =?UTF-8?q?Add=20lstsq:=20minimum-norm=20least=20squar?= =?UTF-8?q?es=20(A=E2=81=BAb)=20for=20rank-deficient=20/=20inconsistent=20?= =?UTF-8?q?systems?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `lstsq(A, b)` returns the Moore–Penrose minimum-norm least-squares solution, correct for rank-deficient or inconsistent A where `\`/`factorize`/`qr` throw SingularException. A separate, opt-in entry point — the direct solves stay exact and keep throwing. Why not the bordered/Woodbury path: solving [S U; V -I][x;y]=[b;0] in least squares minimizes a different objective than ‖Ax−b‖ (measured residual 3–65× too large on rank-deficient inconsistent systems), and there is no Woodbury formula for the pseudoinverse of a sum. So this works on A directly. Two engines: - :dense (default) — LAPACK gelsy (complete-orthogonal decomposition) of the densified A. Exact (matches pinv(Matrix(A))*b to machine precision), O(n³)/O(n²). A = S+UV is always fully dense, so a direct solve cannot exploit the sparsity. - :iterative — LSQR (default) / LSMR driven by A's structured matvec and adjoint, never forming the dense A; for large n. Converges to the same min-norm solution from x0=0. Lives in an IterativeSolvers extension (weakdep). Also adds a structured adjoint/transpose matvec to core (Aᴴu = Sᴴu + Vᴴ(Uᴴu)) — a real fix, since A'*u previously fell back to a ~1000× slower generic getindex path. This lets the iterative engine drive A directly (no LinearMaps/LinearOperators dependency). Engine choice: IterativeSolvers, not Krylov/LinearSolve's KrylovJL_LSMR. Measured: with a correct adjoint, IterativeSolvers lsqr/lsmr reach ~1e-12 min-norm accuracy while Krylov lsmr/lsqr halt early at ~1e-6 (and tightening tolerances does not help). Both converge — Krylov is not "broken" — but IterativeSolvers is ~6 orders tighter, which matters here. Adversarial review found and fixed: (1) lsqr's `isconverged` is unreliable (reports success at the iteration cap with a wrong answer) — now flag on budget exhaustion and report the true LS residual; (2) negative λ silently treated as |λ| by the dense path — now rejected; (3) Int/Rational slipped the eltype guard (checked the float-promoted type) — now checks the raw promoted type. Regression tests for each. Stacked on #4 (the QR PR): test_lstsq.jl asserts qr(A) throws SingularException while lstsq succeeds, so it depends on the QR entry point. 390/390 tests pass on Julia 1.10, 1.11, and 1.12. Runic-formatted, typos clean. Co-Authored-By: Chris Rackauckas --- Project.toml | 6 +- README.md | 42 ++++- ...hDenseRowColMatricesIterativeSolversExt.jl | 53 +++++++ src/SparseWithDenseRowColMatrices.jl | 3 +- src/lstsq.jl | 100 ++++++++++++ src/matvec.jl | 54 +++++++ test/runtests.jl | 3 + test/test_lstsq.jl | 149 ++++++++++++++++++ 8 files changed, 407 insertions(+), 3 deletions(-) create mode 100644 ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl create mode 100644 src/lstsq.jl create mode 100644 test/test_lstsq.jl diff --git a/Project.toml b/Project.toml index f86302c..ffaa145 100644 --- a/Project.toml +++ b/Project.toml @@ -10,14 +10,17 @@ PureKLU = "0c0d3e7f-3a8b-4f7e-b6f1-9a4d2e7c1f01" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [weakdeps] +IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" [extensions] +SparseWithDenseRowColMatricesIterativeSolversExt = "IterativeSolvers" SparseWithDenseRowColMatricesLinearSolveExt = "LinearSolve" [compat] Aqua = "0.8" ForwardDiff = "0.10, 1" +IterativeSolvers = "0.9" LinearAlgebra = "1" LinearSolve = "3" PrecompileTools = "1" @@ -31,6 +34,7 @@ julia = "1.10" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -39,4 +43,4 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Aqua", "ForwardDiff", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"] +test = ["Aqua", "ForwardDiff", "IterativeSolvers", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"] diff --git a/README.md b/README.md index 9f249be..7db9c3c 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,42 @@ refactor since it has no symbolic reuse) and far slower than the Woodbury fast p error is comparable (~`1e-13`) for all three on a well-conditioned `A`. Use `qr` for the factorization/rank-revealing semantics, not for speed. +## Rank-deficient / inconsistent least squares (`lstsq`) + +`\`, `factorize`/`lu`, and `qr` solve **nonsingular** systems and throw `SingularException` +when `A` is singular. For a genuinely rank-deficient or inconsistent system, `lstsq(A, b)` +returns the **minimum-norm least-squares solution** `x = A⁺b` (the Moore–Penrose solution: of +all `x` minimizing `‖Ax − b‖₂`, the one with smallest `‖x‖₂`). + +```julia +x = lstsq(A, b) # exact, dense COD (default) +x = lstsq(A, b; alg = :iterative) # structure-exploiting LSQR/LSMR (needs `using IterativeSolvers`) +x = lstsq(A, b; λ = 1e-3) # Tikhonov-damped (ridge) variant +``` + +This is a **separate, opt-in** entry point — `\`/`factorize`/`qr` stay exact and keep throwing +on singular `A`; least squares never happens implicitly. The bordered/Woodbury machinery +*cannot* produce `A⁺b`: solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a different +objective than `‖Ax−b‖` (measured 3–65× too large a residual on rank-deficient inconsistent +systems), and there is no Woodbury-style formula for the pseudoinverse of a sum. + +Two engines: + +* **`:dense`** (default) — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified + `A`. Exact (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` time / `O(n²)` memory. + Because `A = S + U·V` is **always fully dense** (the rank-`r` outer product fills every + entry), a direct solve genuinely cannot exploit the sparsity — so this is right up to `n` of + a few thousand. +* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint + (`A*x = S*x + U(Vx)`, `Aᴴu = Sᴴu + Vᴴ(Uᴴu)`), so the dense `A` is **never formed** — for `n` + too large to densify. Started from `x0 = 0` it converges to the same minimum-norm solution. + It is approximate (to `atol`/`btol`) and fragile under ill-conditioning; if it exhausts its + `maxiters` budget it warns and reports the achieved least-squares residual. Lives in a package + extension (`using IterativeSolvers`). + +Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. Adjoint/transpose matvec +(`A'*u`, `Aᴴ`, `Aᵀ`) is also useful on its own and is computed by the same structured kernel. + ## Benchmarks A banded sparse interior plus `r = 8` dense rows (`n = 5000`). Factoring only `S` and @@ -194,9 +230,10 @@ globally-scattered `S` degrades every sparse method.) * [x] Matrix–vector / matrix–matrix multiply (`mul!` vector path is allocation-free) * [x] Factorization and linear solve (`factorize`/`lu`, `\`, `ldiv!`) * [x] In-place refactorization with a fixed sparsity pattern (`refactor!`) -* [x] Adjoint / transpose solves +* [x] Adjoint / transpose solves *and* structured adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) * [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) — LU/Woodbury path * [x] Rank-revealing QR (`qr`) for ill-conditioned `S` (`Float64`/`ComplexF64` only) +* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A` `qr(A)` builds the augmented sparse-QR factorization (above). It is the stability path, not the throughput path: no symbolic-reuse refactor, no zero-allocation solve, and `Float64` @@ -317,12 +354,15 @@ augmented mode exists, and augmented QR needs no refinement); singular `A` likew SparseWithDenseRowColMatrix SelectorMatrix sparsepart fillpart lowrankfactors exclusive_sparsepart denserank factorize lu qr \\ ldiv! refactor! lu! qr! update_lowrank! +lstsq # minimum-norm least squares (rank-deficient / inconsistent) SparseWithDenseRowColWoodbury SparseWithDenseRowColAugmented SparseWithDenseRowColQRAugmented recommend_lowrank_peel PeelRecommendation SparseWithDenseRowColFactorization # LinearSolve.jl algorithm (LU/Woodbury, default) SparseWithDenseRowColQRFactorization # LinearSolve.jl algorithm (stable QR, opt-in) ``` +The `:iterative` engine of `lstsq` requires `using IterativeSolvers` (a package extension). + ## Installation PureKLU.jl is not yet in the General registry; install it from the SciML repository: diff --git a/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl new file mode 100644 index 0000000..6441f10 --- /dev/null +++ b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl @@ -0,0 +1,53 @@ +module SparseWithDenseRowColMatricesIterativeSolversExt + +using SparseWithDenseRowColMatrices +using SparseWithDenseRowColMatrices: SparseWithDenseRowColMatrix, _check_lstsq_eltype +using IterativeSolvers +using LinearAlgebra: LinearAlgebra + +# Iterative engine for `lstsq(A, b; alg=:iterative)`. LSQR (default) / LSMR are driven entirely +# by A's structured matvec and adjoint (`mul!(y, A, x)` / `mul!(y, A', x)` defined in +# src/matvec.jl), so the dense `A = S + U·V` is never formed. Started from x0 = 0, both +# converge to the MINIMUM-NORM least-squares solution for rank-deficient / inconsistent systems +# (the min-norm guarantee is contingent on the zero start — the solver is never given a guess). +function SparseWithDenseRowColMatrices._lstsq_iterative( + A::SparseWithDenseRowColMatrix, b::AbstractVector; + solver::Symbol = :lsqr, atol::Real = 1.0e-12, btol::Real = 1.0e-12, + maxiters::Integer = 20 * size(A, 1), λ::Real = 0, verbose::Bool = false, + # accept and ignore the dense-only keyword so `alg` can be flipped without edits + rcond = nothing, + ) + n = size(A, 1) + length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))")) + _check_lstsq_eltype(promote_type(eltype(A), eltype(b))) # reject non-BLAS-float up front + λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ")) + + x, ch = if solver === :lsqr + IterativeSolvers.lsqr(A, b; damp = λ, atol = atol, btol = btol, maxiter = maxiters, log = true) + elseif solver === :lsmr + IterativeSolvers.lsmr(A, b; λ = λ, atol = atol, btol = btol, maxiter = maxiters, log = true) + else + throw(ArgumentError("solver must be :lsqr or :lsmr; got :$solver")) + end + + # IterativeSolvers' `isconverged` is unreliable here (lsqr reports success even when it + # stops at the iteration cap with a wrong answer), so flag on budget exhaustion — which + # cleanly separates converged from non-converged for both lsqr and lsmr — and report the + # true least-squares optimality ‖Aᴴ(Ax−b)‖ (one cheap structured adjoint matvec; →0 at a + # minimizer) rather than trusting the solver's own flag. + capped = ch.iters ≥ maxiters + verbose && @info "lstsq(alg=:iterative)" solver converged = !capped iters = ch.iters + if capped + aresid = LinearAlgebra.norm(A' * (A * x .- b)) + @warn( + "lstsq(...; alg=:iterative, solver=:$solver) hit the $maxiters-iteration budget " * + "without converging — the result is NOT a reliable least-squares solution. " * + "Raise `maxiters`, or use the exact `alg=:dense` if `A` is small enough to densify.", + least_squares_residual = aresid, + iters = ch.iters, + ) + end + return x +end + +end # module diff --git a/src/SparseWithDenseRowColMatrices.jl b/src/SparseWithDenseRowColMatrices.jl index 911c33b..2aa2dfc 100644 --- a/src/SparseWithDenseRowColMatrices.jl +++ b/src/SparseWithDenseRowColMatrices.jl @@ -17,12 +17,13 @@ include("woodbury.jl") include("augmented.jl") include("qr.jl") include("factorize.jl") +include("lstsq.jl") include("detect.jl") export SparseWithDenseRowColMatrix, SelectorMatrix, sparsepart, fillpart, lowrankfactors, exclusive_sparsepart, denserank, SparseWithDenseRowColWoodbury, SparseWithDenseRowColAugmented, - SparseWithDenseRowColQRAugmented, refactor!, update_lowrank!, + SparseWithDenseRowColQRAugmented, refactor!, update_lowrank!, lstsq, recommend_lowrank_peel, PeelRecommendation, SparseWithDenseRowColFactorization, SparseWithDenseRowColQRFactorization diff --git a/src/lstsq.jl b/src/lstsq.jl new file mode 100644 index 0000000..7b8aade --- /dev/null +++ b/src/lstsq.jl @@ -0,0 +1,100 @@ +# ------------------ +# Rank-deficient / inconsistent least squares: minimum-norm solution x = A⁺b +# ------------------ +# +# `\`, `factorize`/`lu`, and `qr` solve NONSINGULAR systems and throw on a singular `A`. For a +# genuinely rank-deficient or inconsistent system one wants the Moore–Penrose solution +# x = A⁺b (the minimum-2-norm vector among the minimizers of ‖Ax−b‖). The bordered/Woodbury +# machinery CANNOT produce it — solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a +# different objective than `‖Ax−b‖`, and there is no Woodbury-style formula for the +# pseudoinverse of a sum. So this is a separate, opt-in entry point. +# +# Two engines: +# :dense (default) — complete-orthogonal decomposition (LAPACK gelsy) of the densified A. +# Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory. Since +# A = S + U·V is always fully dense (the rank-r outer product fills +# every entry), there is no sparsity to exploit in a direct solve. +# :iterative — LSQR/LSMR (IterativeSolvers) driven by A's structured matvec/adjoint, +# never forming the dense A. For n too large to densify. Approximate to +# a tolerance, fragile under ill-conditioning. Lives in an extension +# (`using IterativeSolvers`). + +_lstsq_supported(::Type{<:Union{Float32, Float64, ComplexF32, ComplexF64}}) = true +_lstsq_supported(::Type) = false + +function _check_lstsq_eltype(::Type{T}) where {T} + _lstsq_supported(T) || throw( + ArgumentError( + "lstsq supports only Float32/Float64/ComplexF32/ComplexF64 (it uses LAPACK / a BLAS-float " * + "iterative solver); got eltype $T." + ) + ) + return nothing +end + +# Iterative engine — method added by the IterativeSolvers extension; no method in core. +function _lstsq_iterative end + +""" + lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:dense, kwargs...) -> x + +Minimum-norm least-squares solution `x ≈ A⁺b` (the Moore–Penrose pseudoinverse solution), +correct for **rank-deficient and/or inconsistent** systems where `\\`/`factorize`/`qr` would +throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`. + +* `alg = :dense` (default): complete-orthogonal decomposition (LAPACK `gelsy`) of the densified + `A`. Exact, `O(n³)` time and `O(n²)` memory. `A = S + U·V` is always fully dense, so a direct + solve cannot exploit the sparsity; this is the right choice up to `n` of a few thousand. +* `alg = :iterative`: LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint, + never forming the dense `A` — for `n` too large to densify. Requires `using IterativeSolvers`. + Approximate to a tolerance and fragile under ill-conditioning; see the keyword options. + +Keywords: `rcond` (`:dense` rank cutoff), `solver` (`:lsqr`/`:lsmr`), `atol`, `btol`, +`maxiters`, `λ` (Tikhonov damping; `0` = pure minimum-norm), `verbose` (`:iterative`). +Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. + +`\\`, `factorize`/`lu`, and `qr` are unchanged — they stay exact and throw on singular `A`; +least squares is always an explicit opt-in via `lstsq`. +""" +function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :dense, kwargs...) + if alg === :dense + return _lstsq_dense(A, b; kwargs...) + elseif alg === :iterative + isempty(methods(_lstsq_iterative)) && throw( + ArgumentError( + "lstsq(...; alg=:iterative) requires `using IterativeSolvers` (the iterative engine " * + "lives in a package extension)." + ) + ) + return _lstsq_iterative(A, b; kwargs...) + else + throw(ArgumentError("alg must be :dense or :iterative; got :$alg")) + end +end + +function _lstsq_dense( + A::SparseWithDenseRowColMatrix, b::AbstractVector; + rcond::Real = -one(real(float(eltype(A)))), λ::Real = 0, + # accept and ignore iterative-only keywords so `alg` can be flipped without edits + solver = nothing, atol = nothing, btol = nothing, maxiters = nothing, verbose = nothing, + ) + n = size(A, 1) + length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))")) + λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ")) + _check_lstsq_eltype(promote_type(eltype(A), eltype(b))) # reject non-BLAS-float (e.g. Int) up front + TT = float(promote_type(eltype(A), eltype(b))) + rc = rcond < 0 ? eps(real(TT)) * n : real(TT)(rcond) + M = Matrix(A) + Md = eltype(M) === TT ? copy(M) : TT.(M) + if iszero(λ) + rhs = collect(TT, b) + x, _ = LinearAlgebra.LAPACK.gelsy!(Md, rhs, rc) + return x + else + # Tikhonov: minimize ‖A x − b‖² + λ²‖x‖² = ‖[A; λI] x − [b; 0]‖² + stacked = vcat(Md, real(TT)(λ) * Matrix{TT}(LinearAlgebra.I, n, n)) + rhs = vcat(collect(TT, b), zeros(TT, n)) + x, _ = LinearAlgebra.LAPACK.gelsy!(stacked, rhs, rc) + return x[1:n] + end +end diff --git a/src/matvec.jl b/src/matvec.jl index 9359f9a..b38a0fe 100644 --- a/src/matvec.jl +++ b/src/matvec.jl @@ -115,3 +115,57 @@ function Base.:*(A::SparseWithDenseRowColMatrix, X::AbstractMatrix) Y = Matrix{T}(undef, size(A, 1), size(X, 2)) return mul!(Y, A, X, one(T), zero(T)) end + +# ------------------ +# Adjoint / transpose matvec: Aᴴu = Sᴴu + Vᴴ(Uᴴu) (and Aᵀ with no conjugation) +# ------------------ +# Without these, `mul!(y, A', u)` / `A'*u` fall back to LinearAlgebra's generic `getindex` +# path, which materializes columns of `A` and is ~1000× slower than the structured product — +# crippling for an iterative least-squares solve, whose every step needs an adjoint matvec. + +# w = Uᴴu (length r). Selector U = [I_r; 0] ⇒ (Uᴴu)_k = u_k. +function _lowrank_adjU!(w, U::AbstractMatrix, u, cnj) + @inbounds for k in axes(U, 2) + s = zero(eltype(w)) + for i in axes(U, 1) + s += cnj(U[i, k]) * u[i] + end + w[k] = s + end + return w +end +function _lowrank_adjU!(w, U::SelectorMatrix, u, cnj) + @inbounds for k in 1:U.r + w[k] = u[k] + end + return w +end + +# `Sop` is `adjoint`/`transpose`, `cnj` is `conj`/`identity` (for the dense U/V blocks). +function _adjoint_matvec!( + y::AbstractVector, A::SparseWithDenseRowColMatrix, u::AbstractVector, + α::Number, β::Number, Sop, cnj + ) + mul!(y, Sop(A.S), u, α, β) # y .= α Sᴴu + β y + U, V = A.U, A.V + r = size(V, 1) + w = Vector{promote_type(eltype(A), eltype(u))}(undef, r) + _lowrank_adjU!(w, U, u, cnj) # w = Uᴴu (length r) + @inbounds for k in 1:r + wk = α * w[k] + for j in axes(V, 2) + y[j] += cnj(V[k, j]) * wk # y .+= α Vᴴw + end + end + return y +end + +for (Wrap, Sop, cnj) in ((:Adjoint, :adjoint, :conj), (:Transpose, :transpose, :identity)) + @eval LinearAlgebra.mul!( + y::AbstractVector, wA::$Wrap{<:Any, <:SparseWithDenseRowColMatrix}, u::AbstractVector, + α::Number, β::Number, + ) = _adjoint_matvec!(y, parent(wA), u, α, β, $Sop, $cnj) + @eval LinearAlgebra.mul!( + y::AbstractVector, wA::$Wrap{<:Any, <:SparseWithDenseRowColMatrix}, u::AbstractVector, + ) = mul!(y, wA, u, true, false) +end diff --git a/test/runtests.jl b/test/runtests.jl index 78949fb..e2abce2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -16,6 +16,9 @@ using SafeTestsets, Test @safetestset "QR factorization (augmented, stable)" begin include("test_qr.jl") end + @safetestset "Rank-deficient least squares (min-norm A⁺b)" begin + include("test_lstsq.jl") + end @safetestset "Refactorization (Newton path)" begin include("test_refactor.jl") end diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl new file mode 100644 index 0000000..5e1e0b8 --- /dev/null +++ b/test/test_lstsq.jl @@ -0,0 +1,149 @@ +include("testutils.jl") +using Test, LinearAlgebra, SparseArrays +import IterativeSolvers + +# A = S + U V with EXACTLY rank n-1: zero column/row n of S and zero column n of V make column +# n of A identically zero, so A is genuinely rank-deficient while keeping the S+UV structure. +function rankdef(n, r; seed = 1, T = Float64, selector = false) + rng = MersenneTwister(seed) + S = spdiagm(0 => fill(T(4), n), 1 => fill(T(-1), n - 1), -1 => fill(T(-1), n - 1)) + S[:, n] .= zero(T); S[n, :] .= zero(T); dropzeros!(S) + U = selector ? SelectorMatrix{T}(n, r) : T.(randn(rng, n, r)) + V = T.(randn(rng, r, n)); V[:, n] .= zero(T) + return SparseWithDenseRowColMatrix(S, U, V) +end + +minnorm(M, b) = pinv(M) * b # the oracle: minimum-norm least-squares solution A⁺b + +@testset "adjoint / transpose matvec (core, structured Sᴴu + Vᴴ(Uᴴu))" begin + for (lbl, A) in ( + ("dense-U", rankdef(70, 3; seed = 1)), + ("selector-U", rankdef(70, 3; seed = 2, selector = true)), + ) + M = Matrix(A); u = randn(70) + @test A' * u ≈ M' * u + @test transpose(A) * u ≈ transpose(M) * u + @test mul!(similar(u), A', u) ≈ M' * u + y = randn(70); ref = 2.0 .* (M' * u) .+ 3.0 .* y # 5-arg in-place + mul!(y, A', u, 2.0, 3.0) + @test y ≈ ref + end + # complex: adjoint (conjugating) ≠ transpose + rng = MersenneTwister(9); n, r = 60, 2 + S = spdiagm(0 => fill(ComplexF64(4), n), 1 => fill(ComplexF64(-1), n - 1), -1 => fill(ComplexF64(-1), n - 1)) + Ac = SparseWithDenseRowColMatrix(S, randn(rng, ComplexF64, n, r), randn(rng, ComplexF64, r, n)) + Mc = Matrix(Ac); u = randn(ComplexF64, n) + @test Ac' * u ≈ Mc' * u + @test transpose(Ac) * u ≈ transpose(Mc) * u + @test !(Ac' * u ≈ transpose(Ac) * u) # genuinely different for complex +end + +@testset "dense engine: minimum-norm LS on rank-deficient A (vs pinv oracle)" begin + for seed in 1:4 + A = rankdef(80, 3; seed = seed) + M = Matrix(A) + @test rank(M) == 79 # genuinely rank-deficient + N = nullspace(M) # n × 1 here + for (kind, b) in ( + (:consistent, M * randn(80)), + (:inconsistent, randn(80)), + ) + xref = minnorm(M, b) + x = lstsq(A, b) # alg = :dense default + @test relerr(x, xref) < 1.0e-10 + @test norm(x) ≈ norm(xref) rtol = 1.0e-9 # minimum-norm + @test norm(M' * (M * x - b)) < 1.0e-8 * norm(M) # least-squares optimality + @test norm(N' * x) < 1.0e-9 # no null-space component + kind === :consistent && @test norm(M * x - b) < 1.0e-9 + end + end +end + +@testset "iterative engine recovers the same minimum-norm solution" begin + for seed in 1:3, solver in (:lsqr, :lsmr) + A = rankdef(80, 3; seed = seed) + M = Matrix(A) + b = seed == 1 ? M * randn(80) : randn(80) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :iterative, solver = solver, atol = 1.0e-13, btol = 1.0e-13) + @test relerr(x, xref) < 1.0e-6 + @test norm(x) ≈ norm(xref) rtol = 1.0e-5 + @test relerr(x, lstsq(A, b)) < 1.0e-6 # agrees with the dense engine + end +end + +@testset "SelectorMatrix-U (boundary-condition form)" begin + A = rankdef(80, 2; seed = 5, selector = true) + M = Matrix(A); b = randn(80) + xref = minnorm(M, b) + @test relerr(lstsq(A, b), xref) < 1.0e-10 + @test relerr(lstsq(A, b; alg = :iterative), xref) < 1.0e-6 +end + +@testset "full-rank square reduces to the ordinary solve" begin + A = rand_sparsedense(60, 3; seed = 11) + M = Matrix(A); b = randn(60) + @test relerr(lstsq(A, b), M \ b) < 1.0e-10 + @test relerr(lstsq(A, b; alg = :iterative), M \ b) < 1.0e-6 +end + +@testset "complex rank-deficient LS" begin + rng = MersenneTwister(21); n, r = 70, 2 + S = spdiagm(0 => fill(ComplexF64(4), n), -1 => fill(ComplexF64(-1), n - 1)) + S[:, n] .= 0; S[n, :] .= 0; dropzeros!(S) + V = randn(rng, ComplexF64, r, n); V[:, n] .= 0 + A = SparseWithDenseRowColMatrix(S, randn(rng, ComplexF64, n, r), V) + M = Matrix(A); b = randn(ComplexF64, n) + @test rank(M) == n - 1 + @test relerr(lstsq(A, b), minnorm(M, b)) < 1.0e-10 + @test relerr(lstsq(A, b; alg = :iterative), minnorm(M, b)) < 1.0e-6 +end + +@testset "Tikhonov damping λ > 0 (both engines)" begin + A = rankdef(60, 2; seed = 7) + M = Matrix(A); b = randn(60); λ = 0.1 + xref = (M' * M + λ^2 * I) \ (M' * b) # the damped normal-equations solution + @test relerr(lstsq(A, b; λ = λ), xref) < 1.0e-9 + @test relerr(lstsq(A, b; alg = :iterative, λ = λ, atol = 1.0e-13, btol = 1.0e-13), xref) < 1.0e-6 +end + +@testset "lstsq is a separate opt-in: \\ / factorize / qr still throw on singular A" begin + A = rankdef(50, 2; seed = 3) + b = randn(50) + @test_throws SingularException factorize(A; strategy = :augmented) + @test_throws SingularException qr(A) + @test_throws SingularException (A \ b) + @test relerr(lstsq(A, b), minnorm(Matrix(A), b)) < 1.0e-10 # but lstsq succeeds +end + +@testset "eltype guard and argument validation" begin + A = rankdef(20, 2; seed = 1, T = BigFloat) # gelsy / BLAS-iterative cannot do BigFloat + @test_throws ArgumentError lstsq(A, randn(BigFloat, 20)) + # non-BLAS-float (Int) must be rejected, not silently promoted to a Float64 answer + Si = spdiagm(0 => fill(4, 20), 1 => fill(-1, 19), -1 => fill(-1, 19)) + Ui = zeros(Int, 20, 2); Ui[1, 1] = 1; Ui[2, 2] = 1 + Vi = zeros(Int, 2, 20); Vi[1, 1] = 1; Vi[2, 2] = 1 + Ai = SparseWithDenseRowColMatrix(Si, Ui, Vi) + @test_throws ArgumentError lstsq(Ai, ones(Int, 20)) + @test_throws ArgumentError lstsq(Ai, ones(Int, 20); alg = :iterative) + + Af = rankdef(20, 2; seed = 1) + bf = randn(20) + @test_throws ArgumentError lstsq(Af, bf; alg = :bogus) + @test_throws ArgumentError lstsq(Af, bf; alg = :iterative, solver = :bogus) + @test_throws DimensionMismatch lstsq(Af, randn(19)) + # negative Tikhonov damping rejected by BOTH engines (dense must not silently use |λ|) + @test_throws ArgumentError lstsq(Af, bf; λ = -1.0) + @test_throws ArgumentError lstsq(Af, bf; alg = :iterative, λ = -1.0) +end + +@testset "iterative non-convergence warns (isconverged is unreliable for lsqr)" begin + A = rankdef(60, 3; seed = 4) + b = randn(60) + # a far-too-small budget must WARN for both solvers (the bug was lsqr silently returning a + # wrong answer with isconverged=true; we now flag on budget exhaustion) + @test_logs (:warn,) match_mode = :any lstsq(A, b; alg = :iterative, solver = :lsqr, maxiters = 2) + @test_logs (:warn,) match_mode = :any lstsq(A, b; alg = :iterative, solver = :lsmr, maxiters = 2) + # an ample budget converges, so the result is accurate (and the warn branch is not taken) + @test relerr(lstsq(A, b; alg = :iterative, maxiters = 20 * 60), minnorm(Matrix(A), b)) < 1.0e-6 +end From 699afb6a460ef14cf5bbfe4e17b80b341d262a5d Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Sun, 31 May 2026 07:54:06 -0400 Subject: [PATCH 2/3] lstsq: add a structure-exploiting DIRECT engine (capacitance) + make :auto the default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `alg=:structured` and makes `alg=:auto` the default for `lstsq`. When S is nonsingular, A = S(I + ZV) with Z = S⁻¹U, and the entire rank deficiency collapses into the small r×r capacitance C = I + V Z (nullity(A) = nullity(C), since det A = det S · det C). So A⁺b is assembled from ONE sparse factorization of S (PureKLU) + r solves for Z + small dense SVD/QR on r×r and n×s (s ≤ 2r) blocks — A is NEVER formed or factored densely. O(factor(S) + n·r²); measured 191× faster than the dense COD at n=2000, r=6. Handles consistent AND inconsistent b (a rank-k projection of b onto range(A) via the left-null basis S⁻ᴴVᴴnull(Cᴴ)). :auto uses :structured when S is nonsingular & well-conditioned, else the exact :dense COD. The structured route inverts S, so it is guarded two ways and falls back on failure: (1) klu throws on exactly-singular S; (2) near-singular S is caught by a κ̂(S)=‖S‖₁‖Z‖/‖U‖ estimate (klu does NOT throw there — it silently returns garbage Z, verified) plus a post-hoc least-squares-optimality check. `alg=:structured` (forced) errors rather than return a wrong answer; `alg=:auto` falls back to :dense. Every formula was derived and verified against the pinv(Matrix(A))*b oracle before implementation (consistent/inconsistent × rank deficiency k=1,2,3 × dense-U/SelectorMatrix-U × complex): relerr ~1e-15, min-norm + LS-optimality + null-space-free certificates. The inconsistent-case projection is essential — the naive M⁺S⁻¹b shortcut is 23% wrong on inconsistent b (the LS objective ‖Ax−b‖=‖S(Mx−S⁻¹b)‖ is S-weighted). 581/581 tests pass on Julia 1.10, 1.11, and 1.12. Runic-formatted, typos clean. No new deps (reuses the existing PureKLU dependency for the S factorization). Co-Authored-By: Chris Rackauckas --- README.md | 51 +++++++++----- src/lstsq.jl | 168 +++++++++++++++++++++++++++++++++++++++------ test/test_lstsq.jl | 79 ++++++++++++++++++++- 3 files changed, 256 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 7db9c3c..d6ec5f5 100644 --- a/README.md +++ b/README.md @@ -170,8 +170,10 @@ returns the **minimum-norm least-squares solution** `x = A⁺b` (the Moore–Pen all `x` minimizing `‖Ax − b‖₂`, the one with smallest `‖x‖₂`). ```julia -x = lstsq(A, b) # exact, dense COD (default) -x = lstsq(A, b; alg = :iterative) # structure-exploiting LSQR/LSMR (needs `using IterativeSolvers`) +x = lstsq(A, b) # :auto — structured direct, dense fallback (default) +x = lstsq(A, b; alg = :structured) # structure-exploiting direct solve (never densifies A) +x = lstsq(A, b; alg = :dense) # exact dense complete-orthogonal decomposition (gelsy) +x = lstsq(A, b; alg = :iterative) # LSQR/LSMR (needs `using IterativeSolvers`) x = lstsq(A, b; λ = 1e-3) # Tikhonov-damped (ridge) variant ``` @@ -181,22 +183,31 @@ on singular `A`; least squares never happens implicitly. The bordered/Woodbury m objective than `‖Ax−b‖` (measured 3–65× too large a residual on rank-deficient inconsistent systems), and there is no Woodbury-style formula for the pseudoinverse of a sum. -Two engines: - -* **`:dense`** (default) — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified - `A`. Exact (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` time / `O(n²)` memory. - Because `A = S + U·V` is **always fully dense** (the rank-`r` outer product fills every - entry), a direct solve genuinely cannot exploit the sparsity — so this is right up to `n` of - a few thousand. -* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint - (`A*x = S*x + U(Vx)`, `Aᴴu = Sᴴu + Vᴴ(Uᴴu)`), so the dense `A` is **never formed** — for `n` - too large to densify. Started from `x0 = 0` it converges to the same minimum-norm solution. - It is approximate (to `atol`/`btol`) and fragile under ill-conditioning; if it exhausts its - `maxiters` budget it warns and reports the achieved least-squares residual. Lives in a package - extension (`using IterativeSolvers`). - -Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. Adjoint/transpose matvec -(`A'*u`, `Aᴴ`, `Aᵀ`) is also useful on its own and is computed by the same structured kernel. +Engines (`:auto` picks the first that applies): + +* **`:structured`** — a **direct** solve that **never densifies `A`**. When `S` is nonsingular, + `A = S(I + ZV)` with `Z = S⁻¹U`, and the *entire* rank deficiency collapses into the small + `r × r` capacitance `C = I + V Z` (`nullity(A) = nullity(C)`, since `det A = det S · det C`). + So `A⁺b` is assembled from **one sparse factorization of `S`** (PureKLU) + `r` solves (`Z`) + + small dense SVD/QR on `r × r` and `n × s` (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, **>100× + faster than `:dense`** for large `n` (measured 191× at `n = 2000`, `r = 6`). Handles + consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). Requires `S` + nonsingular and well-conditioned — it errors otherwise (where inverting `S` would be silently + wrong), so `:auto` falls back to `:dense` there. +* **`:dense`** — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. Exact + (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` / `O(n²)`. The mandatory fallback + when `S` is singular/near-singular (the structured route inverts `S`, so it can't apply there). +* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec/adjoint, never + forming `A` — for `n` too large to densify *and* `S` singular (so neither `:structured` nor + `:dense` apply). Started from `x0 = 0` it converges to the same minimum-norm solution; + approximate (to `atol`/`btol`), fragile under ill-conditioning, and warns on non-convergence. + Lives in an extension (`using IterativeSolvers`). + +`:auto` (the default) uses `:structured` when `S` is nonsingular and well-conditioned, +otherwise the exact `:dense` COD — so you get the fast structure-exploiting direct solve when it +is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are +supported. Adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) is computed by the same structured +kernel and is useful on its own. ## Benchmarks @@ -233,7 +244,9 @@ globally-scattered `S` degrades every sparse method.) * [x] Adjoint / transpose solves *and* structured adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) * [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) — LU/Woodbury path * [x] Rank-revealing QR (`qr`) for ill-conditioned `S` (`Float64`/`ComplexF64` only) -* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A` +* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A` — a + structure-exploiting **direct** solve (capacitance method) when `S` is nonsingular, >100× + faster than densifying; exact dense COD fallback otherwise `qr(A)` builds the augmented sparse-QR factorization (above). It is the stability path, not the throughput path: no symbolic-reuse refactor, no zero-allocation solve, and `Float64` diff --git a/src/lstsq.jl b/src/lstsq.jl index 7b8aade..719ae0d 100644 --- a/src/lstsq.jl +++ b/src/lstsq.jl @@ -9,15 +9,23 @@ # different objective than `‖Ax−b‖`, and there is no Woodbury-style formula for the # pseudoinverse of a sum. So this is a separate, opt-in entry point. # -# Two engines: -# :dense (default) — complete-orthogonal decomposition (LAPACK gelsy) of the densified A. -# Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory. Since -# A = S + U·V is always fully dense (the rank-r outer product fills -# every entry), there is no sparsity to exploit in a direct solve. +# Engines: +# :auto (default) — try :structured, fall back to :dense when S is singular/near-singular. +# :structured — DIRECT and structure-exploiting (no densifying). When S is nonsingular, +# A = S(I + ZV) with Z = S⁻¹U, and the entire rank deficiency collapses +# into the small r×r capacitance C = I + V Z (nullity(A) = nullity(C)). +# So A⁺b is built from ONE sparse factorization of S (PureKLU) + r solves +# (Z) + small dense SVD/QR on r×r and n×s (s ≤ 2r) blocks — never forming +# or factoring the dense A. O(factor(S) + n·r²); >100× faster than dense +# COD for large n. Requires S nonsingular & well-conditioned (guarded). +# :dense — complete-orthogonal decomposition (LAPACK gelsy) of the densified A. +# Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory; the +# mandatory fallback when S is singular/near-singular (where the +# structured route is silently wrong). # :iterative — LSQR/LSMR (IterativeSolvers) driven by A's structured matvec/adjoint, -# never forming the dense A. For n too large to densify. Approximate to -# a tolerance, fragile under ill-conditioning. Lives in an extension -# (`using IterativeSolvers`). +# never forming the dense A. For n too large to densify when S is also +# singular (so neither :structured nor :dense apply). Approximate to a +# tolerance, fragile under ill-conditioning. Lives in an extension. _lstsq_supported(::Type{<:Union{Float32, Float64, ComplexF32, ComplexF64}}) = true _lstsq_supported(::Type) = false @@ -36,28 +44,51 @@ end function _lstsq_iterative end """ - lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:dense, kwargs...) -> x + lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:auto, kwargs...) -> x Minimum-norm least-squares solution `x ≈ A⁺b` (the Moore–Penrose pseudoinverse solution), correct for **rank-deficient and/or inconsistent** systems where `\\`/`factorize`/`qr` would -throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`. +throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`. Both the +consistent and inconsistent cases are handled (no user flag). -* `alg = :dense` (default): complete-orthogonal decomposition (LAPACK `gelsy`) of the densified - `A`. Exact, `O(n³)` time and `O(n²)` memory. `A = S + U·V` is always fully dense, so a direct - solve cannot exploit the sparsity; this is the right choice up to `n` of a few thousand. -* `alg = :iterative`: LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint, - never forming the dense `A` — for `n` too large to densify. Requires `using IterativeSolvers`. - Approximate to a tolerance and fragile under ill-conditioning; see the keyword options. +* `alg = :auto` (default): use the structure-exploiting **direct** method when `S` is + nonsingular and well-conditioned, automatically falling back to `:dense` otherwise. +* `alg = :structured`: a **direct** solve that never densifies `A`. When `S` is nonsingular the + rank deficiency collapses into the small `r × r` capacitance `C = I + V S⁻¹U`, so `A⁺b` comes + from one sparse factorization of `S` (PureKLU) plus small dense work on `r × r` / `n × s` + (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, `>100×` faster than `:dense` for large `n`. Errors + if `S` is singular/near-singular (where the structured route would be silently wrong). +* `alg = :dense`: complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. + Exact, `O(n³)` / `O(n²)`; the mandatory fallback when `S` is singular. +* `alg = :iterative`: LSQR / LSMR via `A`'s structured matvec/adjoint, never densifying — for + `n` too large to densify *and* `S` singular. Requires `using IterativeSolvers`; approximate. -Keywords: `rcond` (`:dense` rank cutoff), `solver` (`:lsqr`/`:lsmr`), `atol`, `btol`, -`maxiters`, `λ` (Tikhonov damping; `0` = pure minimum-norm), `verbose` (`:iterative`). -Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. +Keywords: `tolC` (`:structured`/`:auto` rank cutoff for `C`), `rcond` (`:dense` rank cutoff), +`solver` (`:lsqr`/`:lsmr`), `atol`, `btol`, `maxiters`, `λ` (Tikhonov damping; `0` = pure +minimum-norm), `verbose`. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. `\\`, `factorize`/`lu`, and `qr` are unchanged — they stay exact and throw on singular `A`; least squares is always an explicit opt-in via `lstsq`. """ -function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :dense, kwargs...) - if alg === :dense +function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :auto, kwargs...) + if alg === :auto + # structured can't do Tikhonov; route λ>0 (and structured-unsafe S) to the exact dense COD + if iszero(get(kwargs, :λ, 0)) + x, ok = _lstsq_structured(A, b; kwargs...) + ok && return x + end + return _lstsq_dense(A, b; kwargs...) + elseif alg === :structured + x, ok = _lstsq_structured(A, b; kwargs...) + ok || throw( + ArgumentError( + "lstsq(...; alg=:structured) is unsafe for this A: S is singular or near-singular, " * + "so the structured route (which inverts S) would be silently wrong. Use alg=:dense " * + "(exact) or alg=:auto (auto-fallback)." + ) + ) + return x + elseif alg === :dense return _lstsq_dense(A, b; kwargs...) elseif alg === :iterative isempty(methods(_lstsq_iterative)) && throw( @@ -68,10 +99,103 @@ function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = ) return _lstsq_iterative(A, b; kwargs...) else - throw(ArgumentError("alg must be :dense or :iterative; got :$alg")) + throw(ArgumentError("alg must be :auto, :structured, :dense or :iterative; got :$alg")) end end +# Structure-exploiting DIRECT min-norm least-squares (returns `(x, ok)`; `ok=false` signals the +# caller to fall back to dense because S is singular/near-singular). With S nonsingular, +# A = S(I + ZV), Z = S⁻¹U; the rank deficiency lives in C = I + V Z (nullity(A)=nullity(C)). +# A⁺b is assembled from one PureKLU factorization of S + small dense SVD/QR — A is never formed. +function _lstsq_structured( + A::SparseWithDenseRowColMatrix, b::AbstractVector; + tolC::Real = -one(real(float(eltype(A)))), λ::Real = 0, + # accept and ignore the other engines' keywords so `alg` can be flipped without edits + rcond = nothing, solver = nothing, atol = nothing, btol = nothing, + maxiters = nothing, verbose = nothing, + ) + n = size(A, 1) + r = denserank(A) + length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))")) + λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ")) + iszero(λ) || throw(ArgumentError("lstsq(...; alg=:structured) does not support Tikhonov λ>0; use alg=:dense.")) + RT = promote_type(eltype(A), eltype(b)) + _check_lstsq_eltype(RT) + TT = float(RT) + rT = real(TT) + tol = tolC < 0 ? sqrt(eps(rT)) : rT(tolC) + + Sown = _own_sparse(TT, A.S) + Sfact = try + PureKLU.klu(Sown) + catch e + e isa SingularException || rethrow(e) + return (zeros(TT, n), false) # S exactly singular → fall back to dense + end + + if r == 0 # A = S nonsingular ⇒ the solution is S⁻¹b + x = collect(TT, b) + PureKLU.solve!(Sfact, x) + return (x, true) + end + + V = Matrix{TT}(A.V) # r × n + Z = Matrix{TT}(undef, n, r) + materialize_U!(Z, A.U) # Z = U + nrmU = norm(Z) + PureKLU.solve!(Sfact, Z) # Z = S⁻¹U + + # Guard: near-singular S makes Z = S⁻¹U huge/garbage and `klu` does NOT throw, which both + # crashes the SVDs and gives a silently-wrong answer. κ̂(S) = ‖S‖₁·‖Z‖/‖U‖ ≳ κ(S); bail early. + kappaS = nrmU > 0 ? opnorm(Sown, 1) * norm(Z) / nrmU : zero(rT) + (all(isfinite, Z) && isfinite(kappaS) && kappaS ≤ inv(sqrt(eps(rT)))) || + return (zeros(TT, n), false) + + Vadj = Matrix(V') # n × r (adjoint; conjugate for complex) + C = V * Z # r × r capacitance + @inbounds for i in 1:r + C[i, i] += one(TT) + end + Fc = svd(C) + kdef = count(<(tol * Fc.S[1]), Fc.S) # nullity(A) = nullity(C) + + # orthonormal basis W (n × s, s ≤ 2r) of the active subspace span([Z | Vᴴ]); M is the + # identity off this subspace, so Mᴴ-pseudoinverse work happens only on the small s × s block. + QB = qr(hcat(Z, Vadj), ColumnNorm()) + Rd = abs.(diag(QB.R)) + s = isempty(Rd) ? 0 : count(>(tol * maximum(Rd)), Rd) + W = Matrix(QB.Q)[:, 1:s] + Msmall = W' * (Z * (V * W)) # s × s + @inbounds for i in 1:s + Msmall[i, i] += one(TT) + end + Fs = svd(Msmall) + sp = [σ > tol * Fs.S[1] ? inv(σ) : zero(rT) for σ in Fs.S] # truncated pinv of Msmall + + btil = collect(TT, b) + if kdef > 0 # project b onto range(A) (no-op when full rank) + Nl = Fc.U[:, (end - kdef + 1):end] # null(Cᴴ) (r × kdef) + LrA = Vadj * Nl # n × kdef + PureKLU.solve!(Sfact', LrA) # left-null(A) = S⁻ᴴ Vᴴ null(Cᴴ) + QL = Matrix(qr(LrA).Q)[:, 1:kdef] + btil .-= QL * (QL' * btil) + end + PureKLU.solve!(Sfact, btil) # btil = S⁻¹ b̃ (= c) + Wtc = W' * btil + x = W * (Fs.V * (sp .* (Fs.U' * Wtc))) + (btil - W * Wtc) # x = Mᴴ⁺ c + + # post-hoc least-squares optimality check (cheap structured matvecs): catches a misjudged + # rank/nullity. ‖Aᴴ(Ax−b)‖ → 0 at a minimizer; a loose threshold separates a good solution + # (~1e-14 relative) from a grossly wrong one (~1). A false trip only falls `:auto` back to the + # exact dense path, so erring conservative is safe. + res = A * x + res .-= b + atr = A' * res + nrm_atr = norm(atr) + ok = nrm_atr ≤ 1.0e-6 * (norm(A' * collect(TT, b)) + nrm_atr) + return (x, ok) +end + function _lstsq_dense( A::SparseWithDenseRowColMatrix, b::AbstractVector; rcond::Real = -one(real(float(eltype(A)))), λ::Real = 0, diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl index 5e1e0b8..98b0f0a 100644 --- a/test/test_lstsq.jl +++ b/test/test_lstsq.jl @@ -15,6 +15,22 @@ end minnorm(M, b) = pinv(M) * b # the oracle: minimum-norm least-squares solution A⁺b +# A = S + U V with S NONSINGULAR and A rank n-k: choose V so the r×r capacitance C = I + V S⁻¹U +# has nullity k (det A = det S · det C). This is the regime the structured-direct engine targets. +function rankdef_nonsingS(n, r, k; seed = 1, T = Float64, selector = false) + rng = MersenneTwister(seed) + S = spdiagm(-1 => fill(T(-1), n - 1), 0 => fill(T(4), n), 1 => fill(T(-1), n - 1)) + U = selector ? SelectorMatrix{T}(n, r) : T.(randn(rng, n, r)) + Z = Matrix(S) \ Matrix(U) + Q = Matrix(qr(T <: Complex ? randn(rng, T, r, r) : randn(rng, r, r)).Q) + d = ones(T, r) + for i in 1:k + d[i] = zero(T) + end + V = T.((Q * Diagonal(d) * Q' - I(r)) * pinv(Z)) + return SparseWithDenseRowColMatrix(S, U, V) +end + @testset "adjoint / transpose matvec (core, structured Sᴴu + Vᴴ(Uᴴu))" begin for (lbl, A) in ( ("dense-U", rankdef(70, 3; seed = 1)), @@ -49,7 +65,7 @@ end (:inconsistent, randn(80)), ) xref = minnorm(M, b) - x = lstsq(A, b) # alg = :dense default + x = lstsq(A, b; alg = :dense) @test relerr(x, xref) < 1.0e-10 @test norm(x) ≈ norm(xref) rtol = 1.0e-9 # minimum-norm @test norm(M' * (M * x - b)) < 1.0e-8 * norm(M) # least-squares optimality @@ -59,6 +75,67 @@ end end end +@testset "structured direct engine: min-norm LS without densifying (S nonsingular)" begin + for seed in 1:3, (n, r, k) in ((60, 3, 1), (90, 5, 2), (120, 8, 3)), sel in (false, true) + A = rankdef_nonsingS(n, r, k; seed = seed, selector = sel) + M = Matrix(A) + @test rank(M) == n - k + N = nullspace(M) + for (kind, b) in ((:consistent, M * randn(n)), (:inconsistent, randn(n))) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :structured) + @test relerr(x, xref) < 1.0e-9 + @test norm(x) ≈ norm(xref) rtol = 1.0e-8 # minimum-norm + @test norm(M' * (M * x - b)) < 1.0e-7 * norm(M) # least-squares optimality + @test norm(N' * x) < 1.0e-8 # no null-space component + kind === :consistent && @test norm(M * x - b) < 1.0e-8 + end + end +end + +@testset "structured engine: complex" begin + n, r, k = 70, 3, 1 + rng = MersenneTwister(5) + S = spdiagm(0 => fill(ComplexF64(4), n), -1 => fill(ComplexF64(-1), n - 1), 1 => fill(ComplexF64(-1), n - 1)) + U = randn(rng, ComplexF64, n, r); Z = Matrix(S) \ U + Q = Matrix(qr(randn(rng, ComplexF64, r, r)).Q); d = ones(r); d[1] = 0 + V = ComplexF64.((Q * Diagonal(d) * Q' - I(r)) * pinv(Z)) + A = SparseWithDenseRowColMatrix(S, U, V); M = Matrix(A) + b = randn(ComplexF64, n) + @test relerr(lstsq(A, b; alg = :structured), minnorm(M, b)) < 1.0e-9 +end + +@testset ":auto uses structured on nonsingular S, falls back on singular/near-singular S" begin + # nonsingular S: :auto path agrees with explicit :structured and :dense, all = pinv + A = rankdef_nonsingS(100, 4, 2; seed = 1) + M = Matrix(A); b = randn(100); xref = minnorm(M, b) + @test relerr(lstsq(A, b), xref) < 1.0e-9 # :auto default + @test relerr(lstsq(A, b; alg = :auto), xref) < 1.0e-9 + @test lstsq(A, b; alg = :auto) ≈ lstsq(A, b; alg = :structured) + @test lstsq(A, b; alg = :auto) ≈ lstsq(A, b; alg = :dense) + + # exactly-singular S (zeroed row): structured klu throws → :auto falls back to dense + Asing = rankdef(80, 3; seed = 2) + Msing = Matrix(Asing); bs = randn(80) + @test relerr(lstsq(Asing, bs; alg = :auto), minnorm(Msing, bs)) < 1.0e-9 + @test_throws ArgumentError lstsq(Asing, bs; alg = :structured) + + # near-singular S (κ(S) ≈ 5e13, but A well-conditioned): structured would be silently wrong; + # :auto must fall back to the exact dense COD and :structured must error (not return garbage) + n = 80; rng = MersenneTwister(3) + Snear = spdiagm(0 => collect(exp.(range(0, -log(4.7e13); length = n))), -1 => fill(-1.0e-3, n - 1)) + Anear = SparseWithDenseRowColMatrix(Snear, randn(rng, n, 2), randn(rng, 2, n)) + Mnear = Matrix(Anear); bn = randn(n) + @test cond(Mnear) < 1.0e8 # A itself is well-conditioned + @test relerr(lstsq(Anear, bn; alg = :auto), minnorm(Mnear, bn)) < 1.0e-7 + @test_throws ArgumentError lstsq(Anear, bn; alg = :structured) +end + +@testset "structured rejects Tikhonov λ" begin + A = rankdef_nonsingS(40, 2, 1; seed = 1) + @test_throws ArgumentError lstsq(A, randn(40); alg = :structured, λ = 0.1) +end + @testset "iterative engine recovers the same minimum-norm solution" begin for seed in 1:3, solver in (:lsqr, :lsmr) A = rankdef(80, 3; seed = seed) From 4abc4fc3347118dbd3fbe5e1474bc19c54764c68 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas-Claude Date: Sun, 31 May 2026 08:52:56 -0400 Subject: [PATCH 3/3] lstsq: peel a selector-singular S so the structured direct engine handles the BVP case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When S is singular because its boundary rows are zeroed and supplied by a SelectorMatrix U (the `replace=true` / almost-banded case), the structured engine previously fell back to the dense COD. Add a sparse "peel": A = S + UV = (S + U Uᴴ) + U(V − Uᴴ), and for a selector U the term U Uᴴ is just r diagonal entries, so S̃ = S + U Uᴴ stays SPARSE and is typically nonsingular (the zeroed rows get a unit diagonal back), with the rank unchanged. The capacitance method then applies to (S̃, U, V − Uᴴ) — a structure-exploiting DIRECT solve, no densifying. Verified against pinv(Matrix(A))*b: ~2-3e-15 for A nonsingular AND rank-deficient, consistent AND inconsistent b. A general singular S whose null space is NOT coordinate-aligned has a dense de-singularizing correction (no sparse peel exists), so it still falls back to the exact dense COD — a genuine mathematical limit, caught by `_peel_selector` returning nothing. 617/617 tests pass on Julia 1.10, 1.11, and 1.12. Runic-formatted, typos clean. Co-Authored-By: Chris Rackauckas --- README.md | 20 +++++++++++------ src/lstsq.jl | 54 +++++++++++++++++++++++++++++++++++++++------- test/test_lstsq.jl | 26 ++++++++++++++++++++++ 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d6ec5f5..e9c63d8 100644 --- a/README.md +++ b/README.md @@ -191,21 +191,27 @@ Engines (`:auto` picks the first that applies): So `A⁺b` is assembled from **one sparse factorization of `S`** (PureKLU) + `r` solves (`Z`) + small dense SVD/QR on `r × r` and `n × s` (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, **>100× faster than `:dense`** for large `n` (measured 191× at `n = 2000`, `r = 6`). Handles - consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). Requires `S` - nonsingular and well-conditioned — it errors otherwise (where inverting `S` would be silently - wrong), so `:auto` falls back to `:dense` there. + consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). A **singular + `S` with coordinate-aligned null structure** — the boundary-condition / `replace=true` case, + where the zeroed rows are supplied by a `SelectorMatrix` `U` — is handled by a sparse *peel* + `S̃ = S + U Uᴴ` (just `r` diagonal entries, so `S̃` stays sparse and is nonsingular) with + `A = S̃ + U(V − Uᴴ)`, keeping the direct structure-exploiting path. It errors only when `S` is + near-singular or singular with a *general* (dense) null space — where inverting `S` would be + silently wrong — so `:auto` falls back to `:dense` there. * **`:dense`** — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. Exact (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` / `O(n²)`. The mandatory fallback - when `S` is singular/near-singular (the structured route inverts `S`, so it can't apply there). + when `S` is near-singular, or singular with a null space that is *not* coordinate-aligned (so + the sparse peel does not apply and there is no structure-exploiting direct route — the + null-space correction would itself be dense). * **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec/adjoint, never forming `A` — for `n` too large to densify *and* `S` singular (so neither `:structured` nor `:dense` apply). Started from `x0 = 0` it converges to the same minimum-norm solution; approximate (to `atol`/`btol`), fragile under ill-conditioning, and warns on non-convergence. Lives in an extension (`using IterativeSolvers`). -`:auto` (the default) uses `:structured` when `S` is nonsingular and well-conditioned, -otherwise the exact `:dense` COD — so you get the fast structure-exploiting direct solve when it -is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are +`:auto` (the default) uses `:structured` when `S` is nonsingular (or singular with a +coordinate-aligned null space) and well-conditioned, otherwise the exact `:dense` COD — so you +get the fast structure-exploiting direct solve when it is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. Adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) is computed by the same structured kernel and is useful on its own. diff --git a/src/lstsq.jl b/src/lstsq.jl index 719ae0d..61c282c 100644 --- a/src/lstsq.jl +++ b/src/lstsq.jl @@ -103,10 +103,51 @@ function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = end end +# Sparse "peel" for a SINGULAR S whose null structure is coordinate-aligned (the BVP / +# boundary-condition case: `replace=true` zeros the selector rows of S, U = [I_r; 0]). The +# identity A = S + U V = (S + U Uᴴ) + U(V − Uᴴ) re-splits A with S̃ = S + U Uᴴ; for a selector U, +# U Uᴴ is just `r` diagonal entries, so S̃ stays SPARSE and is typically nonsingular (the zeroed +# rows get a unit diagonal back), and the rank is unchanged. The structured method then applies +# to (S̃, U, V − Uᴴ). For a dense U, U Uᴴ is dense (S̃ dense) so no sparse peel exists — return +# `nothing` and let the caller fall back to the dense COD. +_peel_selector(S, U, V) = nothing +function _peel_selector(S::SparseMatrixCSC{T}, U::SelectorMatrix, V::AbstractMatrix{T}) where {T} + r = U.r + r == 0 && return nothing + n = size(S, 1) + Stilde = S + sparse(1:r, 1:r, ones(T, r), n, n) # S̃ = S + U Uᴴ (sparse) + Vtilde = copy(V) + @inbounds for k in 1:r + Vtilde[k, k] -= one(T) # Ṽ = V − Uᴴ + end + return (Stilde, Vtilde) +end + +# Factor S; on (exact) singularity try the sparse selector-peel and factor S̃ instead. Returns +# `(Sfact, S_effective, V_effective)` or `nothing` (caller falls back to the dense COD). +function _klu_or_peel(Sown::SparseMatrixCSC, U, Vmat) + try + return (PureKLU.klu(Sown), Sown, Vmat) + catch e + e isa SingularException || rethrow(e) + end + peeled = _peel_selector(Sown, U, Vmat) + peeled === nothing && return nothing + Stilde, Vtilde = peeled + try + return (PureKLU.klu(Stilde), Stilde, Vtilde) + catch e + e isa SingularException || rethrow(e) + return nothing # S̃ still singular → dense fallback + end +end + # Structure-exploiting DIRECT min-norm least-squares (returns `(x, ok)`; `ok=false` signals the # caller to fall back to dense because S is singular/near-singular). With S nonsingular, # A = S(I + ZV), Z = S⁻¹U; the rank deficiency lives in C = I + V Z (nullity(A)=nullity(C)). # A⁺b is assembled from one PureKLU factorization of S + small dense SVD/QR — A is never formed. +# A SINGULAR S with coordinate-aligned null structure (selector U) is first peeled to a sparse +# nonsingular S̃ (see `_peel_selector`); a general singular S falls back to the dense COD. function _lstsq_structured( A::SparseWithDenseRowColMatrix, b::AbstractVector; tolC::Real = -one(real(float(eltype(A)))), λ::Real = 0, @@ -126,12 +167,10 @@ function _lstsq_structured( tol = tolC < 0 ? sqrt(eps(rT)) : rT(tolC) Sown = _own_sparse(TT, A.S) - Sfact = try - PureKLU.klu(Sown) - catch e - e isa SingularException || rethrow(e) - return (zeros(TT, n), false) # S exactly singular → fall back to dense - end + Vmat = r > 0 ? Matrix{TT}(A.V) : Matrix{TT}(undef, 0, n) + fac = _klu_or_peel(Sown, A.U, Vmat) # factor S, or peel a selector-singular S to S̃ + fac === nothing && return (zeros(TT, n), false) # S (and any peel) singular → fall back to dense + Sfact, Sown, V = fac # S, Sown, V are the *effective* (possibly peeled) ones if r == 0 # A = S nonsingular ⇒ the solution is S⁻¹b x = collect(TT, b) @@ -139,11 +178,10 @@ function _lstsq_structured( return (x, true) end - V = Matrix{TT}(A.V) # r × n Z = Matrix{TT}(undef, n, r) materialize_U!(Z, A.U) # Z = U nrmU = norm(Z) - PureKLU.solve!(Sfact, Z) # Z = S⁻¹U + PureKLU.solve!(Sfact, Z) # Z = S̃⁻¹U # Guard: near-singular S makes Z = S⁻¹U huge/garbage and `klu` does NOT throw, which both # crashes the SVDs and gives a silently-wrong answer. κ̂(S) = ‖S‖₁·‖Z‖/‖U‖ ≳ κ(S); bail early. diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl index 98b0f0a..10312b4 100644 --- a/test/test_lstsq.jl +++ b/test/test_lstsq.jl @@ -105,6 +105,32 @@ end @test relerr(lstsq(A, b; alg = :structured), minnorm(M, b)) < 1.0e-9 end +@testset "structured engine peels a selector-singular S (BVP boundary case)" begin + # S has its boundary rows zeroed (singular); U is a SelectorMatrix supplying them. The sparse + # peel S̃ = S + U Uᴴ (r diagonal entries) keeps the structured DIRECT path — no dense fallback — + # and stays exact, for A nonsingular AND rank-deficient, consistent AND inconsistent. + for seed in 1:3, rde in (false, true) + n, r = 100, 3 + S = spdiagm(0 => fill(3.0, n), 1 => fill(-1.0, n - 1), -1 => fill(-1.0, n - 1)) + S[1:r, :] .= 0.0; dropzeros!(S) + F = zeros(r, n) + for i in 1:r + F[i, i] = 1.0; F[i, i + r] = 0.3 + end + rde && (F[r, :] .= F[1, :]) # make A itself rank-deficient + A = SparseWithDenseRowColMatrix(S, F; replace = true) + M = Matrix(A); N = nullspace(M) + @test rank(M) == n - (rde ? 1 : 0) + for (kind, b) in ((:cons, M * randn(n)), (:incons, randn(n))) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :structured) # must NOT throw — the peel succeeds + @test relerr(x, xref) < 1.0e-9 + @test norm(x) ≈ norm(xref) rtol = 1.0e-8 + isempty(N) || @test norm(N' * x) < 1.0e-8 + end + end +end + @testset ":auto uses structured on nonsingular S, falls back on singular/near-singular S" begin # nonsingular S: :auto path agrees with explicit :structured and :dense, all = pinv A = rankdef_nonsingS(100, 4, 2; seed = 1)