diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 4ec058a..1f7020e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,10 +29,10 @@ jobs: with: version: ${{ matrix.version }} - uses: julia-actions/cache@v2 - # PureKLU is not yet registered in General; develop it from the SciML repo so the - # dependency resolves on every Julia version (`[sources]` in Project.toml covers 1.11+, - # but this keeps 1.10 green too). Remove once PureKLU is registered. - - name: Develop unregistered PureKLU dependency - run: julia --color=yes --project=. -e 'using Pkg; Pkg.develop(url="https://github.com/SciML/PureKLU.jl")' + # PureKLU and SparseColumnPivotedQR are not yet registered in General; develop them from + # the SciML repos so the dependencies resolve on every Julia version. Remove each once it + # is registered. + - name: Develop unregistered SciML dependencies + run: julia --color=yes --project=. -e 'using Pkg; Pkg.develop([PackageSpec(url="https://github.com/SciML/PureKLU.jl"), PackageSpec(url="https://github.com/SciML/SparseColumnPivotedQR.jl")])' - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 diff --git a/Project.toml b/Project.toml index f86302c..87d4af5 100644 --- a/Project.toml +++ b/Project.toml @@ -8,16 +8,21 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" PureKLU = "0c0d3e7f-3a8b-4f7e-b6f1-9a4d2e7c1f01" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +SparseColumnPivotedQR = "a57abbd0-fea5-4d57-96be-5e525945e8e4" +SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1" [weakdeps] +IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" [extensions] +SparseWithDenseRowColMatricesIterativeSolversExt = "IterativeSolvers" SparseWithDenseRowColMatricesLinearSolveExt = "LinearSolve" [compat] Aqua = "0.8" ForwardDiff = "0.10, 1" +IterativeSolvers = "0.9" LinearAlgebra = "1" LinearSolve = "3" PrecompileTools = "1" @@ -25,12 +30,15 @@ PureKLU = "0.1" Random = "1" SafeTestsets = "0.1" SparseArrays = "1" +SparseColumnPivotedQR = "0.1" +SparseMatricesCSR = "0.6" Test = "1" julia = "1.10" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -39,4 +47,4 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Aqua", "ForwardDiff", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"] +test = ["Aqua", "ForwardDiff", "IterativeSolvers", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"] diff --git a/README.md b/README.md index 0877c92..df83087 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,131 @@ The `refactor!` + `\` loop is allocation-free apart from the tiny `O(r)` dense p * **`:auto`** (default) — Woodbury, automatically falling back to the augmented system if `S` is singular or `C` is ill-conditioned. +### QR (the numerically stable path) + +`qr(A; strategy=:augmented)` factors the bordered system `[S U; V -I]` with +[**SparseColumnPivotedQR**](https://github.com/SciML/SparseColumnPivotedQR.jl) (a pure-Julia, +rank-revealing, column-pivoted sparse QR — the QR analogue of PureKLU), returning a +`SparseWithDenseRowColQRAugmented`. Like the augmented LU path it never forms `S⁻¹`, so it stays +accurate when the sparse part `S` is ill-conditioned or nearly singular but the dense rows +regularize `A` — the +[FastAlmostBandedMatrices](https://github.com/SciML/FastAlmostBandedMatrices.jl) regime, where +QR is the conventional choice. Measured forward error on `A = S + U V` with `κ(A) ≈ 6` as +`κ(S)` sweeps from `3.6e6` to `3.6e14` (reference: BigFloat dense solve): + +``` + κ(S) qr (augmented) factorize :woodbury, refine=2 factorize :augmented (LU) + 3.6e6 … 3.1e-16 (flat) 8.7e-17 → 4.5e-7 1.2e-16 (flat) + 3.6e14 +``` + +The Woodbury path's error grows with `κ(S)·κ(C)`; here (where the *assembled* `A` stays +well-conditioned, `κ(A) ≈ 6`) both augmented paths hold at the noise floor. + +**Be clear about what QR does and does not buy you.** Augmented QR is *not* more accurate than +the augmented **LU** path. It is backward-stable, so its forward error tracks `κ(A)·eps`; when +the *assembled* `A` is itself ill-conditioned the augmented-LU path's partial pivoting can be +**more** accurate on these structured matrices (measured: LU near the noise floor where QR sits +at `κ(A)·eps`, orders apart for `κ(A) ≳ 1e6`). QR's genuine, distinct properties versus +augmented-LU are that it is **rank-revealing** (a clean numerical-rank/singularity signal) and +that it is the QR-based interface users of almost-banded solvers expect — not higher accuracy, +which the LU path already delivers, and not throughput (it is slower; see the trade-offs +below). Reach for `qr` when you specifically want a QR factorization or its rank diagnostics; +otherwise `factorize`/`lu` is the better default. + +There is **no** `strategy=:woodbury` for `qr`: a Woodbury-over-`qr(S)` solve shares the same +`κ(S)·κ(C)` cancellation and is catastrophically inaccurate on ill-conditioned `S` (~`1e-1`), +so it is deliberately not offered. Because the backend is genuinely rank-revealing (column +pivoting), `qr` throws `SingularException` exactly when the bordered system is rank-deficient; +a merely ill-conditioned, accurately-solvable `A` is **not** rejected. + +The backend gives the QR path the same engineering niceties as the LU path: the solve is +**allocation-free**, [`refactor!`](@ref)/`qr!` **reuses the symbolic analysis** (column ordering +/ elimination tree) for a fixed sparsity pattern — so `qr` is usable in a Newton / time-stepping +hot loop, not only for one-off stable solves — and **any element type** works (the BLAS floats +plus generic numbers such as `BigFloat`/`ForwardDiff.Dual`, on every Julia version). Cost +(banded interior + `r` dense rows, factor / solve / refactor, ms): + +``` + n=2000 r=4 n=5000 r=8 + qr (augmented QR) : 89 / 2.8 / 60 620 / 15 / 450 ← rank-revealing; alloc-free solve + lu (augmented LU) : 54 / 1.7 / 18 610 / 11 / 180 ← klu! symbolic reuse on refactor + lu (Woodbury) : 1.3/ 0.1 / n/a 3.6 / 0.3 / n/a ← fast path (forms S⁻¹), ~50–60× +``` + +QR is now in the same ballpark as the augmented-LU path (a bit slower on factor/refactor; +allocation-free solve), though still far from the Woodbury fast path. Forward error is +comparable (~`1e-13`) for all three on a well-conditioned `A`. Reach for `qr` when you want a +genuinely rank-revealing factorization, generic-eltype support, or the QR interface; `factorize` +/`lu` (Woodbury) remains the throughput default. + +## Rank-deficient / inconsistent least squares (`lstsq`) + +`\`, `factorize`/`lu`, and `qr` solve **nonsingular** systems and throw `SingularException` +when `A` is singular. For a genuinely rank-deficient or inconsistent system, `lstsq(A, b)` +returns the **minimum-norm least-squares solution** `x = A⁺b` (the Moore–Penrose solution: of +all `x` minimizing `‖Ax − b‖₂`, the one with smallest `‖x‖₂`). + +```julia +x = lstsq(A, b) # :auto — structured direct, dense fallback (default) +x = lstsq(A, b; alg = :structured) # structure-exploiting direct solve (never densifies A) +x = lstsq(A, b; alg = :dense) # exact dense complete-orthogonal decomposition (gelsy) +x = lstsq(A, b; alg = :iterative) # LSQR/LSMR (needs `using IterativeSolvers`) +x = lstsq(A, b; λ = 1e-3) # Tikhonov-damped (ridge) variant +``` + +This is a **separate, opt-in** entry point — `\`/`factorize`/`qr` stay exact and keep throwing +on singular `A`; least squares never happens implicitly. The bordered/Woodbury machinery +*cannot* produce `A⁺b`: solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a different +objective than `‖Ax−b‖` (measured 3–65× too large a residual on rank-deficient inconsistent +systems), and there is no Woodbury-style formula for the pseudoinverse of a sum. + +Engines (`:auto` picks the first that applies): + +* **`:structured`** — a **direct** solve that **never densifies `A`**. When `S` is nonsingular, + `A = S(I + ZV)` with `Z = S⁻¹U`, and the *entire* rank deficiency collapses into the small + `r × r` capacitance `C = I + V Z` (`nullity(A) = nullity(C)`, since `det A = det S · det C`). + So `A⁺b` is assembled from **one sparse factorization of `S`** (PureKLU) + `r` solves (`Z`) + + small dense SVD/QR on `r × r` and `n × s` (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, **>100× + faster than `:dense`** for large `n` (measured 191× at `n = 2000`, `r = 6`). Handles + consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). A **singular + `S` with coordinate-aligned null structure** — the boundary-condition / `replace=true` case, + where the zeroed rows are supplied by a `SelectorMatrix` `U` — is handled by a sparse *peel* + `S̃ = S + U Uᴴ` (just `r` diagonal entries, so `S̃` stays sparse and is nonsingular) with + `A = S̃ + U(V − Uᴴ)`, keeping the direct structure-exploiting path. It errors only when `S` is + near-singular or singular with a *general* (dense) null space — where inverting `S` would be + silently wrong — so `:auto` falls back to `:dense` there. +* **`:dense`** — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. Exact + (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` / `O(n²)`. The mandatory fallback + when `S` is near-singular, or singular with a null space that is *not* coordinate-aligned (so + the sparse peel does not apply and there is no structure-exploiting direct route — the + null-space correction would itself be dense). +* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec/adjoint, never + forming `A` — for `n` too large to densify *and* `S` singular (so neither `:structured` nor + `:dense` apply). Started from `x0 = 0` it converges to the same minimum-norm solution; + approximate (to `atol`/`btol`), fragile under ill-conditioning, and warns on non-convergence. + Lives in an extension (`using IterativeSolvers`). + +`:auto` (the default) uses `:structured` when `S` is nonsingular (or singular with a +coordinate-aligned null space) and well-conditioned, otherwise the exact `:dense` COD — so you +get the fast structure-exploiting direct solve when it is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are +supported. Adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) is computed by the same structured +kernel and is useful on its own. + +For **repeated** least-squares solves with the same `A` (or a Newton / time-stepping loop), +build the factorization once and reuse it — the sparse factorization of `S` and the small dense +decompositions are cached, so each solve is a cheap back-solve: + +```julia +F = SparseWithDenseRowColLeastSquares(A) # structured setup once +x1 = F \ b1; ldiv!(x2, F, b2); … # each solve reuses S's factorization +``` + +Measured (`n = 2000`, `r = 6`): a reused solve is `0.05 ms` / `16 KB` versus the one-shot +`lstsq(A, b)` at `2.1 ms` / `2.6 MB` — **~44× faster**, since it skips re-factoring `S`. (The +structured `alg=:auto`/`:structured` path applies; for a general singular `S` the constructor +errors and you use `lstsq(A, b; alg = :dense)`.) + ## Benchmarks A banded sparse interior plus `r = 8` dense rows (`n = 5000`). Factoring only `S` and @@ -138,11 +263,17 @@ globally-scattered `S` degrades every sparse method.) * [x] Matrix–vector / matrix–matrix multiply (`mul!` vector path is allocation-free) * [x] Factorization and linear solve (`factorize`/`lu`, `\`, `ldiv!`) * [x] In-place refactorization with a fixed sparsity pattern (`refactor!`) -* [x] Adjoint / transpose solves -* [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) - -`qr` is intentionally not provided — a general sparse `QR` causes far more fill than the -LU-based KLU path; use `factorize`/`lu`. +* [x] Adjoint / transpose solves *and* structured adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) +* [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) — LU/Woodbury *and* QR +* [x] Rank-revealing QR (`qr`) for ill-conditioned `S`: allocation-free solve, symbolic-reuse + `refactor!`, any element type (column-pivoted, pure-Julia backend) +* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A` — a + structure-exploiting **direct** solve (capacitance method) when `S` is nonsingular, >100× + faster than densifying; exact dense COD fallback otherwise + +`qr(A)` builds the augmented sparse-QR factorization (above): a rank-revealing column-pivoted +solve with an allocation-free `ldiv!` and a symbolic-reuse `refactor!`. It is comparable to the +augmented-LU path in cost; `factorize`/`lu` (Woodbury) is still the throughput default. ## When is this the right tool? @@ -248,17 +379,27 @@ not throw), again matching KLU. One caveat: once a singular `S` forces the augme the cache keeps reusing it even if later `S` values are nonsingular — `init` a fresh cache to get back the faster Woodbury path. +For the stable QR path, pass `SparseWithDenseRowColQRFactorization(; check_pattern)` instead. +It has no `strategy`/`refine` (only the augmented mode exists and augmented QR needs no +refinement); a value update reuses the QR symbolic analysis (`refactor!`) just like the LU +algorithm, and a singular `A` likewise returns `ReturnCode.Infeasible`. It is opt-in — the +default algorithm stays the LU one. + ## Public API ``` SparseWithDenseRowColMatrix SelectorMatrix sparsepart fillpart lowrankfactors exclusive_sparsepart denserank -factorize lu \\ ldiv! refactor! lu! update_lowrank! -SparseWithDenseRowColWoodbury SparseWithDenseRowColAugmented +factorize lu qr \\ ldiv! refactor! lu! qr! update_lowrank! +lstsq SparseWithDenseRowColLeastSquares # minimum-norm least squares (one-shot; cached/reusable) +SparseWithDenseRowColWoodbury SparseWithDenseRowColAugmented SparseWithDenseRowColQRAugmented recommend_lowrank_peel PeelRecommendation -SparseWithDenseRowColFactorization # LinearSolve.jl algorithm (extension) +SparseWithDenseRowColFactorization # LinearSolve.jl algorithm (LU/Woodbury, default) +SparseWithDenseRowColQRFactorization # LinearSolve.jl algorithm (stable QR, opt-in) ``` +The `:iterative` engine of `lstsq` requires `using IterativeSolvers` (a package extension). + ## Installation PureKLU.jl is not yet in the General registry; install it from the SciML repository: diff --git a/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl new file mode 100644 index 0000000..6441f10 --- /dev/null +++ b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl @@ -0,0 +1,53 @@ +module SparseWithDenseRowColMatricesIterativeSolversExt + +using SparseWithDenseRowColMatrices +using SparseWithDenseRowColMatrices: SparseWithDenseRowColMatrix, _check_lstsq_eltype +using IterativeSolvers +using LinearAlgebra: LinearAlgebra + +# Iterative engine for `lstsq(A, b; alg=:iterative)`. LSQR (default) / LSMR are driven entirely +# by A's structured matvec and adjoint (`mul!(y, A, x)` / `mul!(y, A', x)` defined in +# src/matvec.jl), so the dense `A = S + U·V` is never formed. Started from x0 = 0, both +# converge to the MINIMUM-NORM least-squares solution for rank-deficient / inconsistent systems +# (the min-norm guarantee is contingent on the zero start — the solver is never given a guess). +function SparseWithDenseRowColMatrices._lstsq_iterative( + A::SparseWithDenseRowColMatrix, b::AbstractVector; + solver::Symbol = :lsqr, atol::Real = 1.0e-12, btol::Real = 1.0e-12, + maxiters::Integer = 20 * size(A, 1), λ::Real = 0, verbose::Bool = false, + # accept and ignore the dense-only keyword so `alg` can be flipped without edits + rcond = nothing, + ) + n = size(A, 1) + length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))")) + _check_lstsq_eltype(promote_type(eltype(A), eltype(b))) # reject non-BLAS-float up front + λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ")) + + x, ch = if solver === :lsqr + IterativeSolvers.lsqr(A, b; damp = λ, atol = atol, btol = btol, maxiter = maxiters, log = true) + elseif solver === :lsmr + IterativeSolvers.lsmr(A, b; λ = λ, atol = atol, btol = btol, maxiter = maxiters, log = true) + else + throw(ArgumentError("solver must be :lsqr or :lsmr; got :$solver")) + end + + # IterativeSolvers' `isconverged` is unreliable here (lsqr reports success even when it + # stops at the iteration cap with a wrong answer), so flag on budget exhaustion — which + # cleanly separates converged from non-converged for both lsqr and lsmr — and report the + # true least-squares optimality ‖Aᴴ(Ax−b)‖ (one cheap structured adjoint matvec; →0 at a + # minimizer) rather than trusting the solver's own flag. + capped = ch.iters ≥ maxiters + verbose && @info "lstsq(alg=:iterative)" solver converged = !capped iters = ch.iters + if capped + aresid = LinearAlgebra.norm(A' * (A * x .- b)) + @warn( + "lstsq(...; alg=:iterative, solver=:$solver) hit the $maxiters-iteration budget " * + "without converging — the result is NOT a reliable least-squares solution. " * + "Raise `maxiters`, or use the exact `alg=:dense` if `A` is small enough to densify.", + least_squares_residual = aresid, + iters = ch.iters, + ) + end + return x +end + +end # module diff --git a/ext/SparseWithDenseRowColMatricesLinearSolveExt.jl b/ext/SparseWithDenseRowColMatricesLinearSolveExt.jl index 880d332..e241e57 100644 --- a/ext/SparseWithDenseRowColMatricesLinearSolveExt.jl +++ b/ext/SparseWithDenseRowColMatricesLinearSolveExt.jl @@ -2,10 +2,11 @@ module SparseWithDenseRowColMatricesLinearSolveExt using SparseWithDenseRowColMatrices using SparseWithDenseRowColMatrices: SparseWithDenseRowColMatrix, SparseWithDenseRowColWoodbury, - SparseWithDenseRowColAugmented, denserank, refactor!, SparseWithDenseRowColFactorization + SparseWithDenseRowColAugmented, SparseWithDenseRowColQRAugmented, denserank, refactor!, + SparseWithDenseRowColFactorization, SparseWithDenseRowColQRFactorization using LinearSolve using LinearSolve: LinearCache, OperatorAssumptions, LinearVerbosity, AbstractSparseFactorization -using LinearAlgebra: LinearAlgebra, ldiv!, issuccess, Factorization, SingularException, factorize +using LinearAlgebra: LinearAlgebra, ldiv!, issuccess, Factorization, SingularException, factorize, qr # LinearSolve `@reexport using SciMLBase`, so SciMLBase is reachable through it. const SciMLBase = LinearSolve.SciMLBase @@ -105,4 +106,67 @@ function SciMLBase.solve!(cache::LinearCache, alg::SWDRCFactorizationAlg; kwargs end end +# ------------------ +# QR algorithm (numerically stable, opt-in) +# ------------------ +# Mirrors SWDRCFactorizationAlg but factors via the augmented sparse QR. There is no +# `reuse_symbolic`/`strategy`/`refine`: SuiteSparseQR has no symbolic-reuse refactor (a value +# update always re-`qr`s), only the augmented mode exists, and augmented QR needs no refinement. +struct SWDRCQRFactorizationAlg <: AbstractSparseFactorization + check_pattern::Bool +end + +function SparseWithDenseRowColMatrices.SparseWithDenseRowColQRFactorization(; check_pattern::Bool = true) + return SWDRCQRFactorizationAlg(check_pattern) +end + +function LinearSolve.init_cacheval( + ::SWDRCQRFactorizationAlg, A::SparseWithDenseRowColMatrix{T}, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, + verbose::Union{LinearVerbosity, Bool}, assumptions::OperatorAssumptions + ) where {T} + return SWDRCCacheval{T}(nothing) +end + +# Refresh the cached QR factorization. `refactor!` re-`qr`s in place (reusing the buffers and +# the bordered-matrix shape check); it throws on a singular `A` or a changed pattern, in which +# case we rebuild with `qr`. A genuinely singular `A` makes `qr` throw `SingularException`, +# which we map to a `nothing` cache → `Infeasible` (matching the LU algorithm's contract). +function _refresh_qr!(wrap::SWDRCCacheval, A::SparseWithDenseRowColMatrix, alg::SWDRCQRFactorizationAlg) + f = wrap.fact + if f isa SparseWithDenseRowColQRAugmented && + size(f) == size(A) && denserank(f) == denserank(A) + try + refactor!(f, A; check = alg.check_pattern) + return wrap + catch e + (e isa SingularException || e isa ArgumentError || e isa DimensionMismatch) || rethrow(e) + end + end + try + wrap.fact = qr(A) + catch e + e isa SingularException || rethrow(e) + wrap.fact = nothing + end + return wrap +end + +function SciMLBase.solve!(cache::LinearCache, alg::SWDRCQRFactorizationAlg; kwargs...) + if cache.isfresh + wrap = LinearSolve.@get_cacheval(cache, :SWDRCQRFactorizationAlg) + _refresh_qr!(wrap, cache.A, alg) + cache.isfresh = false + end + F = LinearSolve.@get_cacheval(cache, :SWDRCQRFactorizationAlg).fact + return if F !== nothing && issuccess(F) + y = ldiv!(cache.u, F, cache.b) + SciMLBase.build_linear_solution(alg, y, nothing, cache; retcode = SciMLBase.ReturnCode.Success) + else + SciMLBase.build_linear_solution( + alg, cache.u, nothing, cache; retcode = SciMLBase.ReturnCode.Infeasible + ) + end +end + end # module diff --git a/src/SparseWithDenseRowColMatrices.jl b/src/SparseWithDenseRowColMatrices.jl index 5119ff8..5205bef 100644 --- a/src/SparseWithDenseRowColMatrices.jl +++ b/src/SparseWithDenseRowColMatrices.jl @@ -1,12 +1,14 @@ module SparseWithDenseRowColMatrices using LinearAlgebra -using LinearAlgebra: Factorization, SingularException, ldiv!, lu, lu!, mul!, opnorm, +using LinearAlgebra: Factorization, SingularException, ldiv!, lu, lu!, qr, qr!, mul!, opnorm, issuccess, factorize using SparseArrays using SparseArrays: AbstractSparseMatrixCSC, SparseMatrixCSC, nonzeros, rowvals, getcolptr, nnz, sparse using PureKLU +using SparseColumnPivotedQR +using SparseMatricesCSR: SparseMatrixCSR import PrecompileTools: @setup_workload, @compile_workload @@ -15,13 +17,18 @@ include("matrix.jl") include("matvec.jl") include("woodbury.jl") include("augmented.jl") +include("qr.jl") include("factorize.jl") +include("lstsq.jl") include("detect.jl") export SparseWithDenseRowColMatrix, SelectorMatrix, sparsepart, fillpart, lowrankfactors, exclusive_sparsepart, denserank, - SparseWithDenseRowColWoodbury, SparseWithDenseRowColAugmented, refactor!, update_lowrank!, - recommend_lowrank_peel, PeelRecommendation, SparseWithDenseRowColFactorization + SparseWithDenseRowColWoodbury, SparseWithDenseRowColAugmented, + SparseWithDenseRowColQRAugmented, refactor!, update_lowrank!, lstsq, + SparseWithDenseRowColLeastSquares, + recommend_lowrank_peel, PeelRecommendation, + SparseWithDenseRowColFactorization, SparseWithDenseRowColQRFactorization # --------------------- # Precompilation diff --git a/src/factorize.jl b/src/factorize.jl index 3eebbfa..3a6fb6f 100644 --- a/src/factorize.jl +++ b/src/factorize.jl @@ -70,10 +70,49 @@ Alias for [`refactor!`](@ref) — refactor `F` in place with the new values of ` LinearAlgebra.lu!(F::SparseWithDenseRowColFactorization, A::SparseWithDenseRowColMatrix; kwargs...) = refactor!(F, A; kwargs...) -function LinearAlgebra.qr(::SparseWithDenseRowColMatrix) - throw(ArgumentError("SparseWithDenseRowColMatrix is solved by an LU-based factorization; use `factorize`/`lu`, not `qr`.")) +""" + qr(A::SparseWithDenseRowColMatrix; strategy=:auto) -> SparseWithDenseRowColQRAugmented + +QR-factorize `A = S + U V` for numerically stable repeated solves. Builds a single +rank-revealing, column-pivoted sparse QR ([SparseColumnPivotedQR](https://github.com/SciML/SparseColumnPivotedQR.jl)) +of the bordered system `[S U; V -I]`, returning a [`SparseWithDenseRowColQRAugmented`](@ref). +Unlike [`factorize`](@ref)/[`lu`](@ref) (the LU/Woodbury throughput path) this never forms +`S⁻¹`, so it stays accurate even when the sparse part `S` is ill-conditioned or nearly singular, +as long as `A` itself is nonsingular — the FastAlmostBandedMatrices regime. + +* `strategy = :auto` (default) or `:augmented`: the augmented bordered-system QR (the only mode). +* `strategy = :woodbury`: **not supported** — a Woodbury-over-qr(S) approach shares the + κ(S)·κ(C) cancellation of the LU Woodbury path and is catastrophically inaccurate on + ill-conditioned `S`, defeating the purpose of using QR. Throws. + +The solve is **allocation-free** and [`refactor!`](@ref) / [`qr!`](@ref) **reuses the symbolic +analysis** (column ordering / elimination tree) for a fixed sparsity pattern, so the QR path is +usable in a Newton / time-stepping hot loop, not only for one-off stable solves. Any element +type the backend supports works (the BLAS floats plus generic numbers such as `BigFloat` and +`ForwardDiff.Dual`). Solve with `F \\ b` / `ldiv!(F, b)`; update values with `refactor!`/`qr!`. +""" +function LinearAlgebra.qr(A::SparseWithDenseRowColMatrix; strategy::Symbol = :auto) + strategy in (:auto, :augmented) && return _augmented_qr(A) + strategy === :woodbury && throw( + ArgumentError( + "qr does not support strategy = :woodbury: a Woodbury-over-qr(S) solve shares the \ + κ(S)·κ(C) cancellation of the LU Woodbury path and is inaccurate on ill-conditioned S, \ + defeating the purpose of QR. Use `qr(A; strategy = :augmented)` (the default), or \ + `factorize(A; strategy = :woodbury)` for the LU Woodbury path." + ) + ) + throw(ArgumentError("strategy must be :auto or :augmented for qr; got :$strategy")) end +""" + qr!(F::SparseWithDenseRowColQRAugmented, A::SparseWithDenseRowColMatrix; kwargs...) -> F + +Alias for [`refactor!`](@ref) — re-`qr` `F` in place with the new values of `A`. Note this is a +full re-factorization (SuiteSparseQR has no symbolic reuse), unlike [`lu!`](@ref). +""" +LinearAlgebra.qr!(F::SparseWithDenseRowColQRAugmented, A::SparseWithDenseRowColMatrix; kwargs...) = + refactor!(F, A; kwargs...) + Base.:\(A::SparseWithDenseRowColMatrix, b::AbstractVecOrMat) = factorize(A) \ b """ @@ -97,3 +136,20 @@ It is also the default algorithm `LinearSolve` picks for a `SparseWithDenseRowCo Only available when `LinearSolve` is loaded (provided by a package extension). """ function SparseWithDenseRowColFactorization end + +""" + SparseWithDenseRowColQRFactorization(; check_pattern=true) + +A [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) algorithm that solves +`SparseWithDenseRowColMatrix` systems with the numerically stable augmented sparse QR +([`SparseWithDenseRowColQRAugmented`](@ref)) through LinearSolve's caching interface. Opt-in +(the default LinearSolve algorithm for the matrix type remains the LU +[`SparseWithDenseRowColFactorization`](@ref)); choose this when `S` is ill-conditioned. + +* `check_pattern` — validate the pattern on each refactor (`false` skips the check). + +There is no `reuse_symbolic` option (SuiteSparseQR has no symbolic-reuse refactor, so a value +update always re-`qr`s), no `strategy` (only the augmented mode exists), and no `refine` (the +augmented QR needs no Woodbury refinement). Only available when `LinearSolve` is loaded. +""" +function SparseWithDenseRowColQRFactorization end diff --git a/src/lstsq.jl b/src/lstsq.jl new file mode 100644 index 0000000..60b5844 --- /dev/null +++ b/src/lstsq.jl @@ -0,0 +1,349 @@ +# ------------------ +# Rank-deficient / inconsistent least squares: minimum-norm solution x = A⁺b +# ------------------ +# +# `\`, `factorize`/`lu`, and `qr` solve NONSINGULAR systems and throw on a singular `A`. For a +# genuinely rank-deficient or inconsistent system one wants the Moore–Penrose solution +# x = A⁺b (the minimum-2-norm vector among the minimizers of ‖Ax−b‖). The bordered/Woodbury +# machinery CANNOT produce it — solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a +# different objective than `‖Ax−b‖`, and there is no Woodbury-style formula for the +# pseudoinverse of a sum. So this is a separate, opt-in entry point. +# +# Engines: +# :auto (default) — try :structured, fall back to :dense when S is singular/near-singular. +# :structured — DIRECT and structure-exploiting (no densifying). When S is nonsingular, +# A = S(I + ZV) with Z = S⁻¹U, and the entire rank deficiency collapses +# into the small r×r capacitance C = I + V Z (nullity(A) = nullity(C)). +# So A⁺b is built from ONE sparse factorization of S (PureKLU) + r solves +# (Z) + small dense SVD/QR on r×r and n×s (s ≤ 2r) blocks — never forming +# or factoring the dense A. O(factor(S) + n·r²); >100× faster than dense +# COD for large n. Requires S nonsingular & well-conditioned (guarded). +# :dense — complete-orthogonal decomposition (LAPACK gelsy) of the densified A. +# Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory; the +# mandatory fallback when S is singular/near-singular (where the +# structured route is silently wrong). +# :iterative — LSQR/LSMR (IterativeSolvers) driven by A's structured matvec/adjoint, +# never forming the dense A. For n too large to densify when S is also +# singular (so neither :structured nor :dense apply). Approximate to a +# tolerance, fragile under ill-conditioning. Lives in an extension. + +_lstsq_supported(::Type{<:Union{Float32, Float64, ComplexF32, ComplexF64}}) = true +_lstsq_supported(::Type) = false + +function _check_lstsq_eltype(::Type{T}) where {T} + _lstsq_supported(T) || throw( + ArgumentError( + "lstsq supports only Float32/Float64/ComplexF32/ComplexF64 (it uses LAPACK / a BLAS-float " * + "iterative solver); got eltype $T." + ) + ) + return nothing +end + +# Iterative engine — method added by the IterativeSolvers extension; no method in core. +function _lstsq_iterative end + +""" + lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:auto, kwargs...) -> x + +Minimum-norm least-squares solution `x ≈ A⁺b` (the Moore–Penrose pseudoinverse solution), +correct for **rank-deficient and/or inconsistent** systems where `\\`/`factorize`/`qr` would +throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`. Both the +consistent and inconsistent cases are handled (no user flag). + +* `alg = :auto` (default): use the structure-exploiting **direct** method when `S` is + nonsingular and well-conditioned, automatically falling back to `:dense` otherwise. +* `alg = :structured`: a **direct** solve that never densifies `A`. When `S` is nonsingular the + rank deficiency collapses into the small `r × r` capacitance `C = I + V S⁻¹U`, so `A⁺b` comes + from one sparse factorization of `S` (PureKLU) plus small dense work on `r × r` / `n × s` + (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, `>100×` faster than `:dense` for large `n`. Errors + if `S` is singular/near-singular (where the structured route would be silently wrong). +* `alg = :dense`: complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. + Exact, `O(n³)` / `O(n²)`; the mandatory fallback when `S` is singular. +* `alg = :iterative`: LSQR / LSMR via `A`'s structured matvec/adjoint, never densifying — for + `n` too large to densify *and* `S` singular. Requires `using IterativeSolvers`; approximate. + +Keywords: `tolC` (`:structured`/`:auto` rank cutoff for `C`), `rcond` (`:dense` rank cutoff), +`solver` (`:lsqr`/`:lsmr`), `atol`, `btol`, `maxiters`, `λ` (Tikhonov damping; `0` = pure +minimum-norm), `verbose`. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. + +`\\`, `factorize`/`lu`, and `qr` are unchanged — they stay exact and throw on singular `A`; +least squares is always an explicit opt-in via `lstsq`. +""" +function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :auto, kwargs...) + if alg === :auto + # structured can't do Tikhonov; route λ>0 (and structured-unsafe S) to the exact dense COD + if iszero(get(kwargs, :λ, 0)) + x, ok = _lstsq_structured(A, b; kwargs...) + ok && return x + end + return _lstsq_dense(A, b; kwargs...) + elseif alg === :structured + x, ok = _lstsq_structured(A, b; kwargs...) + ok || throw( + ArgumentError( + "lstsq(...; alg=:structured) is unsafe for this A: S is singular or near-singular, " * + "so the structured route (which inverts S) would be silently wrong. Use alg=:dense " * + "(exact) or alg=:auto (auto-fallback)." + ) + ) + return x + elseif alg === :dense + return _lstsq_dense(A, b; kwargs...) + elseif alg === :iterative + isempty(methods(_lstsq_iterative)) && throw( + ArgumentError( + "lstsq(...; alg=:iterative) requires `using IterativeSolvers` (the iterative engine " * + "lives in a package extension)." + ) + ) + return _lstsq_iterative(A, b; kwargs...) + else + throw(ArgumentError("alg must be :auto, :structured, :dense or :iterative; got :$alg")) + end +end + +# Sparse "peel" for a SINGULAR S whose null structure is coordinate-aligned (the BVP / +# boundary-condition case: `replace=true` zeros the selector rows of S, U = [I_r; 0]). The +# identity A = S + U V = (S + U Uᴴ) + U(V − Uᴴ) re-splits A with S̃ = S + U Uᴴ; for a selector U, +# U Uᴴ is just `r` diagonal entries, so S̃ stays SPARSE and is typically nonsingular (the zeroed +# rows get a unit diagonal back), and the rank is unchanged. The structured method then applies +# to (S̃, U, V − Uᴴ). For a dense U, U Uᴴ is dense (S̃ dense) so no sparse peel exists — return +# `nothing` and let the caller fall back to the dense COD. +_peel_selector(S, U, V) = nothing +function _peel_selector(S::SparseMatrixCSC{T}, U::SelectorMatrix, V::AbstractMatrix{T}) where {T} + r = U.r + r == 0 && return nothing + n = size(S, 1) + Stilde = S + sparse(1:r, 1:r, ones(T, r), n, n) # S̃ = S + U Uᴴ (sparse) + Vtilde = copy(V) + @inbounds for k in 1:r + Vtilde[k, k] -= one(T) # Ṽ = V − Uᴴ + end + return (Stilde, Vtilde) +end + +# Factor S; on (exact) singularity try the sparse selector-peel and factor S̃ instead. Returns +# `(Sfact, S_effective, V_effective)` or `nothing` (caller falls back to the dense COD). +function _klu_or_peel(Sown::SparseMatrixCSC, U, Vmat) + try + return (PureKLU.klu(Sown), Sown, Vmat) + catch e + e isa SingularException || rethrow(e) + end + peeled = _peel_selector(Sown, U, Vmat) + peeled === nothing && return nothing + Stilde, Vtilde = peeled + try + return (PureKLU.klu(Stilde), Stilde, Vtilde) + catch e + e isa SingularException || rethrow(e) + return nothing # S̃ still singular → dense fallback + end +end + +# Structure-exploiting DIRECT min-norm least-squares (returns `(x, ok)`; `ok=false` signals the +# caller to fall back to dense because S is singular/near-singular). With S nonsingular, +# A = S(I + ZV), Z = S⁻¹U; the rank deficiency lives in C = I + V Z (nullity(A)=nullity(C)). +# A⁺b is assembled from one PureKLU factorization of S + small dense SVD/QR — A is never formed. +# A SINGULAR S with coordinate-aligned null structure (selector U) is first peeled to a sparse +# nonsingular S̃ (see `_peel_selector`); a general singular S falls back to the dense COD. +""" + SparseWithDenseRowColLeastSquares{T} <: LinearAlgebra.Factorization{T} + +Cached structure-exploiting least-squares factorization of an [`SparseWithDenseRowColMatrix`](@ref) +`A = S + U V`, for **repeated** minimum-norm least-squares solves (e.g. a Newton / time-stepping +loop): the expensive work — the sparse factorization of `S` (PureKLU) and the small dense +SVD/QR — is done once at construction, so each `F \\ b` / `ldiv!(F, b)` is a cheap back-solve +that **reuses** it. `F \\ b` returns `A⁺b` exactly as [`lstsq`](@ref)`(A, b)` does. + +Constructed by `SparseWithDenseRowColLeastSquares(A)`. Only valid where the structured direct +method applies (`S` nonsingular, or singular with a coordinate-aligned null space); it errors +otherwise — use `lstsq(A, b; alg = :dense)` for a general singular `S`. +""" +struct SparseWithDenseRowColLeastSquares{T, FT} <: LinearAlgebra.Factorization{T} + Sfact::FT # PureKLU factorization of the (possibly peeled) S̃ + W::Matrix{T} # n × s active subspace basis + Msp::Matrix{T} # s × s, the assembled Mᴴ⁺ on the active block + QL::Matrix{T} # n × kdef left-null basis (range-of-A projector); n × 0 if full rank + n::Int + r::Int + kdef::Int + cbuf::Vector{T} # length-n scratch reused across solves +end + +Base.size(F::SparseWithDenseRowColLeastSquares) = (F.n, F.n) +Base.size(F::SparseWithDenseRowColLeastSquares, i::Integer) = i ≤ 2 ? F.n : 1 +denserank(F::SparseWithDenseRowColLeastSquares) = F.r + +# Expensive SETUP: factor S (or peel a selector-singular S̃) and build the small dense +# decompositions. Returns the cached factorization, or `nothing` to signal "fall back to dense" +# (S singular/near-singular with a non-sparse null space). Used by both the one-shot `lstsq` +# and the reusable factorization object. +function _structured_setup(A::SparseWithDenseRowColMatrix, ::Type{RT}; tolC::Real) where {RT} + TT = float(RT) + rT = real(TT) + n = size(A, 1) + r = denserank(A) + tol = tolC < 0 ? sqrt(eps(rT)) : rT(tolC) + + Sown = _own_sparse(TT, A.S) + Vmat = r > 0 ? Matrix{TT}(A.V) : Matrix{TT}(undef, 0, n) + fac = _klu_or_peel(Sown, A.U, Vmat) # factor S, or peel a selector-singular S to S̃ + fac === nothing && return nothing # S (and any peel) singular → fall back to dense + Sfact, Sown, V = fac # the *effective* (possibly peeled) factors + + empty_n0 = Matrix{TT}(undef, n, 0) + cbuf = Vector{TT}(undef, n) + r == 0 && return SparseWithDenseRowColLeastSquares{TT, typeof(Sfact)}( # A = S nonsingular + Sfact, empty_n0, Matrix{TT}(undef, 0, 0), empty_n0, n, 0, 0, cbuf + ) + + Z = Matrix{TT}(undef, n, r) + materialize_U!(Z, A.U) # Z = U + nrmU = norm(Z) + PureKLU.solve!(Sfact, Z) # Z = S̃⁻¹U + + # Guard: near-singular S makes Z = S⁻¹U huge/garbage and `klu` does NOT throw, which both + # crashes the SVDs and gives a silently-wrong answer. κ̂(S) = ‖S‖₁·‖Z‖/‖U‖ ≳ κ(S); bail early. + kappaS = nrmU > 0 ? opnorm(Sown, 1) * norm(Z) / nrmU : zero(rT) + (all(isfinite, Z) && isfinite(kappaS) && kappaS ≤ inv(sqrt(eps(rT)))) || return nothing + + Vadj = Matrix(V') # n × r (adjoint; conjugate for complex) + C = V * Z # r × r capacitance + @inbounds for i in 1:r + C[i, i] += one(TT) + end + Fc = svd(C) + kdef = count(<(tol * Fc.S[1]), Fc.S) # nullity(A) = nullity(C) + + # orthonormal basis W (n × s, s ≤ 2r) of the active subspace span([Z | Vᴴ]); M is the + # identity off this subspace, so Mᴴ-pseudoinverse work happens only on the small s × s block. + QB = qr(hcat(Z, Vadj), ColumnNorm()) + Rd = abs.(diag(QB.R)) + s = isempty(Rd) ? 0 : count(>(tol * maximum(Rd)), Rd) + W = Matrix(QB.Q)[:, 1:s] + Msmall = W' * (Z * (V * W)) # s × s + @inbounds for i in 1:s + Msmall[i, i] += one(TT) + end + Fs = svd(Msmall) + sp = [σ > tol * Fs.S[1] ? inv(σ) : zero(rT) for σ in Fs.S] # truncated pinv of Msmall + Msp = Fs.V * Diagonal(sp) * Fs.U' # s × s assembled Mᴴ⁺ on the active block + + QL = if kdef > 0 + Nl = Fc.U[:, (end - kdef + 1):end] # null(Cᴴ) (r × kdef) + LrA = Vadj * Nl # n × kdef + PureKLU.solve!(Sfact', LrA) # left-null(A) = S⁻ᴴ Vᴴ null(Cᴴ) + Matrix(qr(LrA).Q)[:, 1:kdef] + else + empty_n0 + end + return SparseWithDenseRowColLeastSquares{TT, typeof(Sfact)}(Sfact, W, Msp, QL, n, r, kdef, cbuf) +end + +# Cheap APPLY: x = A⁺b from the cached factorization (no re-factorization of S). Reads `b` fully +# before writing `x`, so `x === b` (in-place) is allowed. +function _structured_apply!(x::AbstractVector, F::SparseWithDenseRowColLeastSquares{T}, b::AbstractVector) where {T} + c = F.cbuf + copyto!(c, b) + if F.kdef > 0 # project b onto range(A): c = b − QL(QLᴴ b) + mul!(c, F.QL, F.QL' * b, -one(T), one(T)) + end + PureKLU.solve!(F.Sfact, c) # c = S̃⁻¹ b̃ + if size(F.W, 2) > 0 + Wtc = F.W' * c # s-vector + copyto!(x, c) + mul!(x, F.W, Wtc, -one(T), one(T)) # x = c − W Wᴴc + mul!(x, F.W, F.Msp * Wtc, one(T), one(T)) # x += W Mᴴ⁺ Wᴴc + else + copyto!(x, c) + end + return x +end + +# One-shot structured solve (returns `(x, ok)`; `ok=false` ⇒ caller falls back to dense). Setup + +# apply + a post-hoc least-squares-optimality check on the actual `b` (catches a misjudged rank). +function _lstsq_structured( + A::SparseWithDenseRowColMatrix, b::AbstractVector; + tolC::Real = -one(real(float(eltype(A)))), λ::Real = 0, + # accept and ignore the other engines' keywords so `alg` can be flipped without edits + rcond = nothing, solver = nothing, atol = nothing, btol = nothing, + maxiters = nothing, verbose = nothing, + ) + n = size(A, 1) + length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))")) + λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ")) + iszero(λ) || throw(ArgumentError("lstsq(...; alg=:structured) does not support Tikhonov λ>0; use alg=:dense.")) + RT = promote_type(eltype(A), eltype(b)) + _check_lstsq_eltype(RT) + TT = float(RT) + F = _structured_setup(A, RT; tolC = tolC) + F === nothing && return (zeros(TT, n), false) + x = _structured_apply!(Vector{TT}(undef, n), F, collect(TT, b)) + + # post-hoc least-squares optimality: ‖Aᴴ(Ax−b)‖ → 0 at a minimizer; a loose threshold + # separates a good solution (~1e-14 relative) from a grossly wrong one. A false trip only + # falls `:auto` back to the exact dense path, so erring conservative is safe. + res = A * x + res .-= b + atr = A' * res + nrm_atr = norm(atr) + ok = nrm_atr ≤ 1.0e-6 * (norm(A' * collect(TT, b)) + nrm_atr) + return (x, ok) +end + +""" + SparseWithDenseRowColLeastSquares(A::SparseWithDenseRowColMatrix; tolC=…) -> F + +Build a reusable structured least-squares factorization for repeated `A⁺b` solves (see +[`SparseWithDenseRowColLeastSquares`](@ref)). Errors if the structured direct method does not +apply to `A` (general singular/near-singular `S`); use `lstsq(A, b; alg = :dense)` there. +""" +function SparseWithDenseRowColLeastSquares(A::SparseWithDenseRowColMatrix; tolC::Real = -one(real(float(eltype(A))))) + _check_lstsq_eltype(eltype(A)) + F = _structured_setup(A, eltype(A); tolC = tolC) + F === nothing && throw( + ArgumentError( + "a structured least-squares factorization does not apply to this A (S is singular or " * + "near-singular with a non-sparse null space); use `lstsq(A, b; alg = :dense)`." + ) + ) + return F +end + +LinearAlgebra.ldiv!(F::SparseWithDenseRowColLeastSquares, b::AbstractVector) = + (length(b) == F.n || throw(DimensionMismatch()); _structured_apply!(b, F, b)) +LinearAlgebra.ldiv!(x::AbstractVector, F::SparseWithDenseRowColLeastSquares, b::AbstractVector) = + (length(b) == F.n || throw(DimensionMismatch()); _structured_apply!(x, F, b)) +function Base.:\(F::SparseWithDenseRowColLeastSquares{T}, b::AbstractVector) where {T} + length(b) == F.n || throw(DimensionMismatch()) + return _structured_apply!(Vector{T}(undef, F.n), F, eltype(b) === T ? b : convert(Vector{T}, b)) +end + +function _lstsq_dense( + A::SparseWithDenseRowColMatrix, b::AbstractVector; + rcond::Real = -one(real(float(eltype(A)))), λ::Real = 0, + # accept and ignore iterative-only keywords so `alg` can be flipped without edits + solver = nothing, atol = nothing, btol = nothing, maxiters = nothing, verbose = nothing, + ) + n = size(A, 1) + length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))")) + λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ")) + _check_lstsq_eltype(promote_type(eltype(A), eltype(b))) # reject non-BLAS-float (e.g. Int) up front + TT = float(promote_type(eltype(A), eltype(b))) + rc = rcond < 0 ? eps(real(TT)) * n : real(TT)(rcond) + M = Matrix(A) + Md = eltype(M) === TT ? copy(M) : TT.(M) + if iszero(λ) + rhs = collect(TT, b) + x, _ = LinearAlgebra.LAPACK.gelsy!(Md, rhs, rc) + return x + else + # Tikhonov: minimize ‖A x − b‖² + λ²‖x‖² = ‖[A; λI] x − [b; 0]‖² + stacked = vcat(Md, real(TT)(λ) * Matrix{TT}(LinearAlgebra.I, n, n)) + rhs = vcat(collect(TT, b), zeros(TT, n)) + x, _ = LinearAlgebra.LAPACK.gelsy!(stacked, rhs, rc) + return x[1:n] + end +end diff --git a/src/matvec.jl b/src/matvec.jl index 9359f9a..b38a0fe 100644 --- a/src/matvec.jl +++ b/src/matvec.jl @@ -115,3 +115,57 @@ function Base.:*(A::SparseWithDenseRowColMatrix, X::AbstractMatrix) Y = Matrix{T}(undef, size(A, 1), size(X, 2)) return mul!(Y, A, X, one(T), zero(T)) end + +# ------------------ +# Adjoint / transpose matvec: Aᴴu = Sᴴu + Vᴴ(Uᴴu) (and Aᵀ with no conjugation) +# ------------------ +# Without these, `mul!(y, A', u)` / `A'*u` fall back to LinearAlgebra's generic `getindex` +# path, which materializes columns of `A` and is ~1000× slower than the structured product — +# crippling for an iterative least-squares solve, whose every step needs an adjoint matvec. + +# w = Uᴴu (length r). Selector U = [I_r; 0] ⇒ (Uᴴu)_k = u_k. +function _lowrank_adjU!(w, U::AbstractMatrix, u, cnj) + @inbounds for k in axes(U, 2) + s = zero(eltype(w)) + for i in axes(U, 1) + s += cnj(U[i, k]) * u[i] + end + w[k] = s + end + return w +end +function _lowrank_adjU!(w, U::SelectorMatrix, u, cnj) + @inbounds for k in 1:U.r + w[k] = u[k] + end + return w +end + +# `Sop` is `adjoint`/`transpose`, `cnj` is `conj`/`identity` (for the dense U/V blocks). +function _adjoint_matvec!( + y::AbstractVector, A::SparseWithDenseRowColMatrix, u::AbstractVector, + α::Number, β::Number, Sop, cnj + ) + mul!(y, Sop(A.S), u, α, β) # y .= α Sᴴu + β y + U, V = A.U, A.V + r = size(V, 1) + w = Vector{promote_type(eltype(A), eltype(u))}(undef, r) + _lowrank_adjU!(w, U, u, cnj) # w = Uᴴu (length r) + @inbounds for k in 1:r + wk = α * w[k] + for j in axes(V, 2) + y[j] += cnj(V[k, j]) * wk # y .+= α Vᴴw + end + end + return y +end + +for (Wrap, Sop, cnj) in ((:Adjoint, :adjoint, :conj), (:Transpose, :transpose, :identity)) + @eval LinearAlgebra.mul!( + y::AbstractVector, wA::$Wrap{<:Any, <:SparseWithDenseRowColMatrix}, u::AbstractVector, + α::Number, β::Number, + ) = _adjoint_matvec!(y, parent(wA), u, α, β, $Sop, $cnj) + @eval LinearAlgebra.mul!( + y::AbstractVector, wA::$Wrap{<:Any, <:SparseWithDenseRowColMatrix}, u::AbstractVector, + ) = mul!(y, wA, u, true, false) +end diff --git a/src/qr.jl b/src/qr.jl new file mode 100644 index 0000000..607883f --- /dev/null +++ b/src/qr.jl @@ -0,0 +1,168 @@ +# ------------------ +# Augmented QR factorization (numerically stable path) +# ------------------ +# +# Solve A x = b with A = S + U V via the bordered system +# +# [ S U ] [x] [b] +# [ V -I ] [y] = [0] +# +# factored as ONE sparse, rank-revealing, column-pivoted QR (SparseColumnPivotedQR's `csr_qr`) +# of size (n+r). This is the numerically stable analogue of the augmented LU path: it never +# forms `S⁻¹`, so it stays accurate even when the sparse part `S` is ill-conditioned or nearly +# singular but the dense rows/columns regularize `A` (the FastAlmostBandedMatrices regime). The +# bordered system is full-rank and well-conditioned whenever `A` itself is nonsingular, +# regardless of κ(S) — which is why a Woodbury-over-qr(S) approach is NOT shipped: it shares the +# κ(S)·κ(C) cancellation and is catastrophically wrong once κ(S) ≳ 1e11, while this path holds +# near the noise floor across the whole range. +# +# Backend: SparseColumnPivotedQR.jl (pure Julia, like PureKLU is for LU). Versus a SuiteSparseQR +# backend this gives (a) an allocation-free in-place solve, (b) a symbolic-reuse `csr_refactor!` +# for the Newton/time-stepping hot path, (c) genuine column-pivoted rank revelation (so a +# singular `A` is detected directly from `rank`), and (d) generic element types. + +""" + SparseWithDenseRowColQRAugmented{T} <: LinearAlgebra.Factorization{T} + +QR factorization of a [`SparseWithDenseRowColMatrix`](@ref) `A = S + U V`, built as a single +rank-revealing, column-pivoted sparse QR (SparseColumnPivotedQR) of the bordered system +`[S U; V -I]` of size `n + r`. This is the numerically stable analogue of +[`SparseWithDenseRowColAugmented`](@ref) (which uses LU): it never forms `S⁻¹`, so it stays +accurate even when `S` is ill-conditioned or nearly singular, as long as `A` itself is +nonsingular. Created by [`qr`](@ref). + +The solve is allocation-free, [`refactor!`](@ref) reuses the symbolic analysis (so it is the +throughput path for fixed-pattern Newton loops as well as the stability path), and any element +type the backend supports is allowed (the BLAS floats plus generic numbers such as `BigFloat` +and `ForwardDiff.Dual`). The adjoint/transpose solve uses a separately built, lazily cached +factorization of the bordered system's adjoint. +""" +mutable struct SparseWithDenseRowColQRAugmented{T, QF} <: LinearAlgebra.Factorization{T} + Qaug::QF # CSRQRFactorization of [S U; V -I] + QaugH::Any # nothing until first adjoint/transpose solve, then a CSRQRFactorization + MaugC::SparseMatrixCSC{T, Int} # owned bordered matrix (CSC; source for refactor! and the adjoint) + n::Int + r::Int + rankaug::Int # numerical rank of Qaug (== n+r ⇒ A nonsingular) + rhs::Vector{T} # length n+r — [b; 0] / RHS scratch + rsol::Vector{T} # length n+r — receives the in-place solve +end + +Base.size(F::SparseWithDenseRowColQRAugmented) = (F.n, F.n) +Base.size(F::SparseWithDenseRowColQRAugmented, i::Integer) = i ≤ 2 ? F.n : 1 +LinearAlgebra.issuccess(F::SparseWithDenseRowColQRAugmented) = F.rankaug == F.n + F.r +denserank(F::SparseWithDenseRowColQRAugmented) = F.r + +_bordered_csr(MaugC::SparseMatrixCSC) = SparseMatrixCSR(MaugC) +# CSR of Mᴴ / Mᵀ (for the adjoint/transpose factorization), built from the owned CSC. +_adjoint_csr(MaugC::SparseMatrixCSC{T}) where {T} = _bordered_csr(SparseMatrixCSC{T, Int}(MaugC')) +_transpose_csr(MaugC::SparseMatrixCSC{T}) where {T} = _bordered_csr(SparseMatrixCSC{T, Int}(transpose(MaugC))) + +function _augmented_qr(A::SparseWithDenseRowColMatrix{T}) where {T} + n = size(A, 1) + r = denserank(A) + MaugC = SparseMatrixCSC{T, Int}(_augmented_matrix(A)) + Qaug = SparseColumnPivotedQR.csr_qr(_bordered_csr(MaugC)) + rankaug = LinearAlgebra.rank(Qaug) + rankaug == n + r || throw(SingularException(0)) # A singular ⇒ bordered system singular + return SparseWithDenseRowColQRAugmented{T, typeof(Qaug)}( + Qaug, nothing, MaugC, n, r, rankaug, + Vector{T}(undef, n + r), Vector{T}(undef, n + r) + ) +end + +# Fill `F.rhs = [b; 0]`, solve in place into `F.rsol` with `fact`, copy the leading `n` back. +@inline function _qr_aug_solve!(F::SparseWithDenseRowColQRAugmented{T}, fact, b::AbstractVector) where {T} + @inbounds copyto!(view(F.rhs, 1:F.n), b) + @inbounds for i in (F.n + 1):(F.n + F.r) + F.rhs[i] = zero(T) + end + LinearAlgebra.ldiv!(F.rsol, fact, F.rhs) # allocation-free in-place solve + @inbounds copyto!(b, view(F.rsol, 1:F.n)) + return b +end + +function LinearAlgebra.ldiv!(F::SparseWithDenseRowColQRAugmented, b::AbstractVector) + length(b) == F.n || throw(DimensionMismatch()) + return _qr_aug_solve!(F, F.Qaug, b) +end + +function LinearAlgebra.ldiv!(F::SparseWithDenseRowColQRAugmented, B::AbstractMatrix) + size(B, 1) == F.n || throw(DimensionMismatch()) + for c in axes(B, 2) + ldiv!(F, view(B, :, c)) + end + return B +end + +LinearAlgebra.ldiv!(y::AbstractVector, F::SparseWithDenseRowColQRAugmented, b::AbstractVector) = + ldiv!(F, copyto!(y, b)) +LinearAlgebra.ldiv!(Y::AbstractMatrix, F::SparseWithDenseRowColQRAugmented, B::AbstractMatrix) = + ldiv!(F, copyto!(Y, B)) + +# Adjoint / transpose solve. The bordered matrix Mᴴ = [Sᴴ Vᴴ; Uᴴ -I] has Schur complement +# Sᴴ + Vᴴ Uᴴ = (S + U V)ᴴ = Aᴴ (and Mᵀ ↔ Aᵀ), so solving Mᴴ[x;y]=[b;0] gives x = A⁻ᴴ b. The +# column-pivoted QR has no `Qaug' \ b`, so we build a SEPARATE factorization of the adjoint and +# cache it lazily (it is invalidated on every `refactor!`). +function _augfact_adj!(F::SparseWithDenseRowColQRAugmented) + F.QaugH === nothing && (F.QaugH = SparseColumnPivotedQR.csr_qr(_adjoint_csr(F.MaugC))) + return F.QaugH +end +# For real T, transpose == adjoint, so reuse the cached factorization; for complex T the +# (rare) transpose path rebuilds qr(transpose(Maug)) each call rather than holding a 3rd cache. +_augfact_tr!(F::SparseWithDenseRowColQRAugmented{<:Real}) = _augfact_adj!(F) +_augfact_tr!(F::SparseWithDenseRowColQRAugmented) = SparseColumnPivotedQR.csr_qr(_transpose_csr(F.MaugC)) + +for (Wrap, getfact) in ((AdjointFact, :_augfact_adj!), (TransposeFact, :_augfact_tr!)) + @eval function LinearAlgebra.ldiv!( + Fw::$Wrap{<:Any, <:SparseWithDenseRowColQRAugmented}, b::AbstractVector + ) + F = parent(Fw) + length(b) == F.n || throw(DimensionMismatch()) + return _qr_aug_solve!(F, $getfact(F), b) + end + @eval function LinearAlgebra.ldiv!(Fw::$Wrap{<:Any, <:SparseWithDenseRowColQRAugmented}, B::AbstractMatrix) + for c in axes(B, 2) + ldiv!(Fw, view(B, :, c)) + end + return B + end +end + +# Complex RHS over a real augmented-QR factorization (real/imag split; see woodbury.jl). +LinearAlgebra.ldiv!(F::SparseWithDenseRowColQRAugmented{<:Real}, b::AbstractVector{<:Complex}) = + _ldiv_realfact_complex!(F, b) +LinearAlgebra.ldiv!(Fw::AdjointFact{<:Any, <:SparseWithDenseRowColQRAugmented{<:Real}}, b::AbstractVector{<:Complex}) = + _ldiv_realfact_complex!(Fw, b) +LinearAlgebra.ldiv!(Fw::TransposeFact{<:Any, <:SparseWithDenseRowColQRAugmented{<:Real}}, b::AbstractVector{<:Complex}) = + _ldiv_realfact_complex!(Fw, b) + +""" + refactor!(F::SparseWithDenseRowColQRAugmented, A::SparseWithDenseRowColMatrix; check=true) -> F + +Rebuild the augmented system with the new values of `A` and re-factor in place. When the +bordered sparsity pattern is unchanged this **reuses the symbolic analysis** (column ordering, +elimination tree) and the preallocated numeric workspace — far cheaper than a fresh +factorization — making the QR path usable in a Newton / time-stepping hot loop, not just for +one-off stable solves. (The re-factorization kernel itself is allocation-free; this wrapper +still rebuilds the bordered matrix each call, so it is not yet fully zero-allocation.) The shape +(`n`, `r`) must be unchanged. +""" +function refactor!(F::SparseWithDenseRowColQRAugmented{T}, A::SparseWithDenseRowColMatrix; check::Bool = true) where {T} + denserank(A) == F.r && size(A, 1) == F.n || + throw(DimensionMismatch("shape changed; rebuild with `qr`.")) + F.MaugC = SparseMatrixCSC{T, Int}(_augmented_matrix(A)) + SparseColumnPivotedQR.csr_refactor!(F.Qaug, _bordered_csr(F.MaugC)) # reuses the symbolic analysis + F.QaugH = nothing # invalidate cached adjoint + F.rankaug = LinearAlgebra.rank(F.Qaug) + check && (F.rankaug == F.n + F.r || throw(SingularException(0))) + return F +end + +# The augmented-QR path embeds U/V inside one factorization and keeps no separate factorization +# of S to reuse, so the Woodbury low-rank fast path does not apply here (same as augmented LU). +function update_lowrank!(::SparseWithDenseRowColQRAugmented; kwargs...) + throw(ArgumentError("update_lowrank! is the Woodbury fast path (it reuses S's factorization); \ + the augmented QR path embeds U/V in one factorization and has no separate S factorization \ + to reuse — update with `refactor!` or rebuild with `qr`.")) +end diff --git a/test/runtests.jl b/test/runtests.jl index fc11674..e2abce2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,6 +13,12 @@ using SafeTestsets, Test @safetestset "Augmented fallback" begin include("test_augmented.jl") end + @safetestset "QR factorization (augmented, stable)" begin + include("test_qr.jl") + end + @safetestset "Rank-deficient least squares (min-norm A⁺b)" begin + include("test_lstsq.jl") + end @safetestset "Refactorization (Newton path)" begin include("test_refactor.jl") end @@ -38,6 +44,17 @@ using SafeTestsets, Test # structured matrices (`Diagonal`/`Bidiagonal`/`Tridiagonal`/`Triangular`) in # `*`/`mul!` — combinations that never arise in use, and where both candidate # methods compute the same product anyway. - Aqua.test_all(SparseWithDenseRowColMatrices; ambiguities = false, piracies = true) + # + # `persistent_tasks` uses a generous `tmax`: neither this package nor PureKLU spawns any + # task at module load (verified — no `@async`/`Threads.@spawn`/`Timer`/`__init__`; the + # load subprocess exits in ~0.4s), so the check has nothing real to catch. Its default + # 10s wall-clock budget for the spawned subprocess to *exit* is, however, flaky on a + # loaded CI host (heavier first-load on 1.12), producing a false positive under + # precompile contention. The larger budget removes that flake while still flagging a + # genuinely hung load. + Aqua.test_all( + SparseWithDenseRowColMatrices; + ambiguities = false, piracies = true, persistent_tasks = (; tmax = 60), + ) end end diff --git a/test/test_edgecases.jl b/test/test_edgecases.jl index c4f3e38..373e727 100644 --- a/test/test_edgecases.jl +++ b/test/test_edgecases.jl @@ -91,7 +91,10 @@ end @test_throws SingularException factorize(Asing) end -@testset "qr is intentionally unsupported" begin +@testset "qr returns the augmented QR factorization" begin A = rand_sparsedense(20, 2) - @test_throws ArgumentError qr(A) + b = randn(20) + F = qr(A) + @test F isa SparseWithDenseRowColQRAugmented + @test F \ b ≈ Matrix(A) \ b end diff --git a/test/test_linearsolve.jl b/test/test_linearsolve.jl index 5b879e8..5e11817 100644 --- a/test/test_linearsolve.jl +++ b/test/test_linearsolve.jl @@ -65,6 +65,58 @@ end @test solve!(cache).retcode == ReturnCode.Infeasible end +@testset "QR algorithm: solve, cache reuse, and Infeasible parity" begin + A = rand_sparsedense(n, r; seed = 41) + b = randn(n) + cache = init(LinearProblem(A, b), SparseWithDenseRowColQRFactorization()) + s1 = solve!(cache) + @test s1.retcode == ReturnCode.Success + @test s1.u ≈ Matrix(A) \ b + + # new values, same pattern → refactor! re-qrs in the cache + A2 = SparseWithDenseRowColMatrix( + SparseMatrixCSC( + size(A)..., copy(sparsepart(A).colptr), copy(sparsepart(A).rowval), + sparsepart(A).nzval .* 1.5 .+ 0.01 + ), + lowrankfactors(A)[1], Matrix(fillpart(A)) .+ 0.2 + ) + cache.A = A2 + @test solve!(cache).u ≈ Matrix(A2) \ b + # new RHS only + b2 = randn(n) + cache.b = b2 + @test solve!(cache).u ≈ Matrix(A2) \ b2 + + # singular A → Infeasible, no throw (parity with the LU algorithm) + nn = 20 + S = sparse(2.0 * I, nn, nn) + S[5, 5] = 0.0 + Asing = SparseWithDenseRowColMatrix(S, zeros(nn, 1), zeros(1, nn)) + sol = solve(LinearProblem(Asing, randn(nn)), SparseWithDenseRowColQRFactorization()) + @test sol.retcode == ReturnCode.Infeasible +end + +@testset "QR algorithm stays accurate on ill-conditioned S" begin + # Where the default (Woodbury) algorithm would lose accuracy, the opt-in QR algorithm holds. + nn, rr = 120, 3 + S = spdiagm(0 => fill(4.0, nn), 1 => fill(-1.0, nn - 1), -1 => fill(-1.0, nn - 1)) + for i in 1:rr + S[i, :] .*= 1.0e-13 + end + dropzeros!(S) + Fdense = zeros(rr, nn) + for i in 1:rr + Fdense[i, i] = 1.0; Fdense[i, i + rr] = 0.3 + end + A = SparseWithDenseRowColMatrix(S, Fdense; replace = true) + b = randn(nn) + xref = Float64.(big.(Matrix(A)) \ big.(b)) + sol = solve(LinearProblem(A, b), SparseWithDenseRowColQRFactorization()) + @test sol.retcode == ReturnCode.Success + @test norm(sol.u .- xref) / norm(xref) < 1.0e-12 +end + @testset "singular S routes through the augmented fallback under caching" begin # S singular (zeroed top rows), A nonsingular via the dense fill rows nn, rr = 50, 2 diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl new file mode 100644 index 0000000..31a929e --- /dev/null +++ b/test/test_lstsq.jl @@ -0,0 +1,285 @@ +include("testutils.jl") +using Test, LinearAlgebra, SparseArrays +import IterativeSolvers + +# A = S + U V with EXACTLY rank n-1: zero column/row n of S and zero column n of V make column +# n of A identically zero, so A is genuinely rank-deficient while keeping the S+UV structure. +function rankdef(n, r; seed = 1, T = Float64, selector = false) + rng = MersenneTwister(seed) + S = spdiagm(0 => fill(T(4), n), 1 => fill(T(-1), n - 1), -1 => fill(T(-1), n - 1)) + S[:, n] .= zero(T); S[n, :] .= zero(T); dropzeros!(S) + U = selector ? SelectorMatrix{T}(n, r) : T.(randn(rng, n, r)) + V = T.(randn(rng, r, n)); V[:, n] .= zero(T) + return SparseWithDenseRowColMatrix(S, U, V) +end + +minnorm(M, b) = pinv(M) * b # the oracle: minimum-norm least-squares solution A⁺b + +# A = S + U V with S NONSINGULAR and A rank n-k: choose V so the r×r capacitance C = I + V S⁻¹U +# has nullity k (det A = det S · det C). This is the regime the structured-direct engine targets. +function rankdef_nonsingS(n, r, k; seed = 1, T = Float64, selector = false) + rng = MersenneTwister(seed) + S = spdiagm(-1 => fill(T(-1), n - 1), 0 => fill(T(4), n), 1 => fill(T(-1), n - 1)) + U = selector ? SelectorMatrix{T}(n, r) : T.(randn(rng, n, r)) + Z = Matrix(S) \ Matrix(U) + Q = Matrix(qr(T <: Complex ? randn(rng, T, r, r) : randn(rng, r, r)).Q) + d = ones(T, r) + for i in 1:k + d[i] = zero(T) + end + V = T.((Q * Diagonal(d) * Q' - I(r)) * pinv(Z)) + return SparseWithDenseRowColMatrix(S, U, V) +end + +@testset "adjoint / transpose matvec (core, structured Sᴴu + Vᴴ(Uᴴu))" begin + for (lbl, A) in ( + ("dense-U", rankdef(70, 3; seed = 1)), + ("selector-U", rankdef(70, 3; seed = 2, selector = true)), + ) + M = Matrix(A); u = randn(70) + @test A' * u ≈ M' * u + @test transpose(A) * u ≈ transpose(M) * u + @test mul!(similar(u), A', u) ≈ M' * u + y = randn(70); ref = 2.0 .* (M' * u) .+ 3.0 .* y # 5-arg in-place + mul!(y, A', u, 2.0, 3.0) + @test y ≈ ref + end + # complex: adjoint (conjugating) ≠ transpose + rng = MersenneTwister(9); n, r = 60, 2 + S = spdiagm(0 => fill(ComplexF64(4), n), 1 => fill(ComplexF64(-1), n - 1), -1 => fill(ComplexF64(-1), n - 1)) + Ac = SparseWithDenseRowColMatrix(S, randn(rng, ComplexF64, n, r), randn(rng, ComplexF64, r, n)) + Mc = Matrix(Ac); u = randn(ComplexF64, n) + @test Ac' * u ≈ Mc' * u + @test transpose(Ac) * u ≈ transpose(Mc) * u + @test !(Ac' * u ≈ transpose(Ac) * u) # genuinely different for complex +end + +@testset "dense engine: minimum-norm LS on rank-deficient A (vs pinv oracle)" begin + for seed in 1:4 + A = rankdef(80, 3; seed = seed) + M = Matrix(A) + @test rank(M) == 79 # genuinely rank-deficient + N = nullspace(M) # n × 1 here + for (kind, b) in ( + (:consistent, M * randn(80)), + (:inconsistent, randn(80)), + ) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :dense) + @test relerr(x, xref) < 1.0e-10 + @test norm(x) ≈ norm(xref) rtol = 1.0e-9 # minimum-norm + @test norm(M' * (M * x - b)) < 1.0e-8 * norm(M) # least-squares optimality + @test norm(N' * x) < 1.0e-9 # no null-space component + kind === :consistent && @test norm(M * x - b) < 1.0e-9 + end + end +end + +@testset "structured direct engine: min-norm LS without densifying (S nonsingular)" begin + for seed in 1:3, (n, r, k) in ((60, 3, 1), (90, 5, 2), (120, 8, 3)), sel in (false, true) + A = rankdef_nonsingS(n, r, k; seed = seed, selector = sel) + M = Matrix(A) + @test rank(M) == n - k + N = nullspace(M) + for (kind, b) in ((:consistent, M * randn(n)), (:inconsistent, randn(n))) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :structured) + @test relerr(x, xref) < 1.0e-9 + @test norm(x) ≈ norm(xref) rtol = 1.0e-8 # minimum-norm + @test norm(M' * (M * x - b)) < 1.0e-7 * norm(M) # least-squares optimality + @test norm(N' * x) < 1.0e-8 # no null-space component + kind === :consistent && @test norm(M * x - b) < 1.0e-8 + end + end +end + +@testset "structured engine: complex" begin + n, r, k = 70, 3, 1 + rng = MersenneTwister(5) + S = spdiagm(0 => fill(ComplexF64(4), n), -1 => fill(ComplexF64(-1), n - 1), 1 => fill(ComplexF64(-1), n - 1)) + U = randn(rng, ComplexF64, n, r); Z = Matrix(S) \ U + Q = Matrix(qr(randn(rng, ComplexF64, r, r)).Q); d = ones(r); d[1] = 0 + V = ComplexF64.((Q * Diagonal(d) * Q' - I(r)) * pinv(Z)) + A = SparseWithDenseRowColMatrix(S, U, V); M = Matrix(A) + b = randn(ComplexF64, n) + @test relerr(lstsq(A, b; alg = :structured), minnorm(M, b)) < 1.0e-9 +end + +@testset "structured engine peels a selector-singular S (BVP boundary case)" begin + # S has its boundary rows zeroed (singular); U is a SelectorMatrix supplying them. The sparse + # peel S̃ = S + U Uᴴ (r diagonal entries) keeps the structured DIRECT path — no dense fallback — + # and stays exact, for A nonsingular AND rank-deficient, consistent AND inconsistent. + for seed in 1:3, rde in (false, true) + n, r = 100, 3 + S = spdiagm(0 => fill(3.0, n), 1 => fill(-1.0, n - 1), -1 => fill(-1.0, n - 1)) + S[1:r, :] .= 0.0; dropzeros!(S) + F = zeros(r, n) + for i in 1:r + F[i, i] = 1.0; F[i, i + r] = 0.3 + end + rde && (F[r, :] .= F[1, :]) # make A itself rank-deficient + A = SparseWithDenseRowColMatrix(S, F; replace = true) + M = Matrix(A); N = nullspace(M) + @test rank(M) == n - (rde ? 1 : 0) + for (kind, b) in ((:cons, M * randn(n)), (:incons, randn(n))) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :structured) # must NOT throw — the peel succeeds + @test relerr(x, xref) < 1.0e-9 + @test norm(x) ≈ norm(xref) rtol = 1.0e-8 + isempty(N) || @test norm(N' * x) < 1.0e-8 + end + end +end + +@testset ":auto uses structured on nonsingular S, falls back on singular/near-singular S" begin + # nonsingular S: :auto path agrees with explicit :structured and :dense, all = pinv + A = rankdef_nonsingS(100, 4, 2; seed = 1) + M = Matrix(A); b = randn(100); xref = minnorm(M, b) + @test relerr(lstsq(A, b), xref) < 1.0e-9 # :auto default + @test relerr(lstsq(A, b; alg = :auto), xref) < 1.0e-9 + @test lstsq(A, b; alg = :auto) ≈ lstsq(A, b; alg = :structured) + @test lstsq(A, b; alg = :auto) ≈ lstsq(A, b; alg = :dense) + + # exactly-singular S (zeroed row): structured klu throws → :auto falls back to dense + Asing = rankdef(80, 3; seed = 2) + Msing = Matrix(Asing); bs = randn(80) + @test relerr(lstsq(Asing, bs; alg = :auto), minnorm(Msing, bs)) < 1.0e-9 + @test_throws ArgumentError lstsq(Asing, bs; alg = :structured) + + # near-singular S (κ(S) ≈ 5e13, but A well-conditioned): structured would be silently wrong; + # :auto must fall back to the exact dense COD and :structured must error (not return garbage) + n = 80; rng = MersenneTwister(3) + Snear = spdiagm(0 => collect(exp.(range(0, -log(4.7e13); length = n))), -1 => fill(-1.0e-3, n - 1)) + Anear = SparseWithDenseRowColMatrix(Snear, randn(rng, n, 2), randn(rng, 2, n)) + Mnear = Matrix(Anear); bn = randn(n) + @test cond(Mnear) < 1.0e8 # A itself is well-conditioned + @test relerr(lstsq(Anear, bn; alg = :auto), minnorm(Mnear, bn)) < 1.0e-7 + @test_throws ArgumentError lstsq(Anear, bn; alg = :structured) +end + +@testset "structured rejects Tikhonov λ" begin + A = rankdef_nonsingS(40, 2, 1; seed = 1) + @test_throws ArgumentError lstsq(A, randn(40); alg = :structured, λ = 0.1) +end + +@testset "iterative engine recovers the same minimum-norm solution" begin + for seed in 1:3, solver in (:lsqr, :lsmr) + A = rankdef(80, 3; seed = seed) + M = Matrix(A) + b = seed == 1 ? M * randn(80) : randn(80) + xref = minnorm(M, b) + x = lstsq(A, b; alg = :iterative, solver = solver, atol = 1.0e-13, btol = 1.0e-13) + @test relerr(x, xref) < 1.0e-6 + @test norm(x) ≈ norm(xref) rtol = 1.0e-5 + @test relerr(x, lstsq(A, b)) < 1.0e-6 # agrees with the dense engine + end +end + +@testset "SelectorMatrix-U (boundary-condition form)" begin + A = rankdef(80, 2; seed = 5, selector = true) + M = Matrix(A); b = randn(80) + xref = minnorm(M, b) + @test relerr(lstsq(A, b), xref) < 1.0e-10 + @test relerr(lstsq(A, b; alg = :iterative), xref) < 1.0e-6 +end + +@testset "full-rank square reduces to the ordinary solve" begin + A = rand_sparsedense(60, 3; seed = 11) + M = Matrix(A); b = randn(60) + @test relerr(lstsq(A, b), M \ b) < 1.0e-10 + @test relerr(lstsq(A, b; alg = :iterative), M \ b) < 1.0e-6 +end + +@testset "complex rank-deficient LS" begin + rng = MersenneTwister(21); n, r = 70, 2 + S = spdiagm(0 => fill(ComplexF64(4), n), -1 => fill(ComplexF64(-1), n - 1)) + S[:, n] .= 0; S[n, :] .= 0; dropzeros!(S) + V = randn(rng, ComplexF64, r, n); V[:, n] .= 0 + A = SparseWithDenseRowColMatrix(S, randn(rng, ComplexF64, n, r), V) + M = Matrix(A); b = randn(ComplexF64, n) + @test rank(M) == n - 1 + @test relerr(lstsq(A, b), minnorm(M, b)) < 1.0e-10 + @test relerr(lstsq(A, b; alg = :iterative), minnorm(M, b)) < 1.0e-6 +end + +@testset "Tikhonov damping λ > 0 (both engines)" begin + A = rankdef(60, 2; seed = 7) + M = Matrix(A); b = randn(60); λ = 0.1 + xref = (M' * M + λ^2 * I) \ (M' * b) # the damped normal-equations solution + @test relerr(lstsq(A, b; λ = λ), xref) < 1.0e-9 + @test relerr(lstsq(A, b; alg = :iterative, λ = λ, atol = 1.0e-13, btol = 1.0e-13), xref) < 1.0e-6 +end + +@testset "cached factorization (SparseWithDenseRowColLeastSquares) reuses S for repeated solves" begin + for sel in (false, true) + A = rankdef_nonsingS(100, 4, 2; seed = 1, selector = sel) + M = Matrix(A) + F = SparseWithDenseRowColLeastSquares(A) + @test denserank(F) == 4 + @test size(F) == (100, 100) + for b in (M * randn(100), randn(100)) # consistent, inconsistent + xref = minnorm(M, b) + @test relerr(F \ b, xref) < 1.0e-9 + x = copy(b); ldiv!(F, x) # in-place (alias-safe) + @test relerr(x, xref) < 1.0e-9 + y = similar(b); ldiv!(y, F, b) # 3-arg + @test relerr(y, xref) < 1.0e-9 + end + end + # the cached factorization agrees with the one-shot lstsq on the SAME rhs + A = rankdef_nonsingS(80, 3, 1; seed = 2) + b = randn(80) + @test SparseWithDenseRowColLeastSquares(A) \ b ≈ lstsq(A, b; alg = :structured) + # selector-singular S (BVP): the cached factorization peels and works + n, r = 80, 3 + S = spdiagm(0 => fill(3.0, n), 1 => fill(-1.0, n - 1), -1 => fill(-1.0, n - 1)) + S[1:r, :] .= 0.0; dropzeros!(S) + Fb = zeros(r, n); for i in 1:r + Fb[i, i] = 1.0; Fb[i, i + r] = 0.3 + end + Abvp = SparseWithDenseRowColMatrix(S, Fb; replace = true); Mbvp = Matrix(Abvp); bb = randn(n) + @test relerr(SparseWithDenseRowColLeastSquares(Abvp) \ bb, minnorm(Mbvp, bb)) < 1.0e-9 + # general singular S: the constructor errors (points the user to alg=:dense) + @test_throws ArgumentError SparseWithDenseRowColLeastSquares(rankdef(40, 2; seed = 3)) +end + +@testset "lstsq is a separate opt-in: \\ / factorize / qr still throw on singular A" begin + A = rankdef(50, 2; seed = 3) + b = randn(50) + @test_throws SingularException factorize(A; strategy = :augmented) + @test_throws SingularException qr(A) + @test_throws SingularException (A \ b) + @test relerr(lstsq(A, b), minnorm(Matrix(A), b)) < 1.0e-10 # but lstsq succeeds +end + +@testset "eltype guard and argument validation" begin + A = rankdef(20, 2; seed = 1, T = BigFloat) # gelsy / BLAS-iterative cannot do BigFloat + @test_throws ArgumentError lstsq(A, randn(BigFloat, 20)) + # non-BLAS-float (Int) must be rejected, not silently promoted to a Float64 answer + Si = spdiagm(0 => fill(4, 20), 1 => fill(-1, 19), -1 => fill(-1, 19)) + Ui = zeros(Int, 20, 2); Ui[1, 1] = 1; Ui[2, 2] = 1 + Vi = zeros(Int, 2, 20); Vi[1, 1] = 1; Vi[2, 2] = 1 + Ai = SparseWithDenseRowColMatrix(Si, Ui, Vi) + @test_throws ArgumentError lstsq(Ai, ones(Int, 20)) + @test_throws ArgumentError lstsq(Ai, ones(Int, 20); alg = :iterative) + + Af = rankdef(20, 2; seed = 1) + bf = randn(20) + @test_throws ArgumentError lstsq(Af, bf; alg = :bogus) + @test_throws ArgumentError lstsq(Af, bf; alg = :iterative, solver = :bogus) + @test_throws DimensionMismatch lstsq(Af, randn(19)) + # negative Tikhonov damping rejected by BOTH engines (dense must not silently use |λ|) + @test_throws ArgumentError lstsq(Af, bf; λ = -1.0) + @test_throws ArgumentError lstsq(Af, bf; alg = :iterative, λ = -1.0) +end + +@testset "iterative non-convergence warns (isconverged is unreliable for lsqr)" begin + A = rankdef(60, 3; seed = 4) + b = randn(60) + # a far-too-small budget must WARN for both solvers (the bug was lsqr silently returning a + # wrong answer with isconverged=true; we now flag on budget exhaustion) + @test_logs (:warn,) match_mode = :any lstsq(A, b; alg = :iterative, solver = :lsqr, maxiters = 2) + @test_logs (:warn,) match_mode = :any lstsq(A, b; alg = :iterative, solver = :lsmr, maxiters = 2) + # an ample budget converges, so the result is accurate (and the warn branch is not taken) + @test relerr(lstsq(A, b; alg = :iterative, maxiters = 20 * 60), minnorm(Matrix(A), b)) < 1.0e-6 +end diff --git a/test/test_qr.jl b/test/test_qr.jl new file mode 100644 index 0000000..6f43df3 --- /dev/null +++ b/test/test_qr.jl @@ -0,0 +1,236 @@ +include("testutils.jl") +using Test, LinearAlgebra, SparseArrays + +# A = S + U V with an ILL-conditioned sparse part S (κ(S) ≈ `kappa`) but a well-conditioned +# assembled A (κ(A) ≈ 6): the top `r` rows of the tridiagonal S are scaled by 1/kappa (making +# S nearly rank-deficient there) and the dense fill replaces exactly those rows with O(1) +# entries. This is the FastAlmostBandedMatrices regime where forming S⁻¹ (Woodbury) loses +# accuracy but the augmented QR — which never forms S⁻¹ — does not. +function illcond_S(n, r, kappa; T = Float64) + S = spdiagm(0 => fill(T(4), n), 1 => fill(T(-1), n - 1), -1 => fill(T(-1), n - 1)) + for i in 1:r + S[i, :] .*= inv(T(kappa)) + end + dropzeros!(S) + F = zeros(T, r, n) + for i in 1:r + F[i, i] = one(T) + F[i, i + r] = T(0.3) + end + return SparseWithDenseRowColMatrix(S, F; replace = true) +end + +# Exact solve of the (rounded) working-precision matrix, computed in BigFloat — the reference +# every forward-error assertion compares against (matches how the design was measured). +function exact_solve(A, b) + x = big.(Matrix(A)) \ big.(b) + return convert.(eltype(b), x) +end + + +@testset "well-conditioned: qr agrees with factorize at the noise floor" begin + A = rand_sparsedense(300, 4; seed = 7) + b = randn(300) + xref = exact_solve(A, b) + F = qr(A) + @test F isa SparseWithDenseRowColQRAugmented + @test issuccess(F) + @test size(F) == (300, 300) + @test denserank(F) == 4 + @test relerr(F \ b, xref) < 1.0e-12 + @test relerr(factorize(A) \ b, xref) < 1.0e-10 # both at noise floor +end + +@testset "ill-conditioned S: augmented QR holds where Woodbury degrades" begin + n, r = 200, 4 + b = randn(MersenneTwister(0), n) + # The load-bearing regression guard: across the entire κ(S) sweep — including κ(S) ≥ 3.6e11 + # where Woodbury (even with refinement) loses accuracy — the augmented QR stays ≤ 1e-13. + for kappa in (3.6e6, 3.6e10, 3.6e11, 3.6e12, 3.6e14) + A = illcond_S(n, r, kappa) + xref = exact_solve(A, b) + F = qr(A) + @test issuccess(F) + @test relerr(F \ b, xref) < 1.0e-13 + end + # And at the top of the sweep the QR is decisively better than the default Woodbury path. + A = illcond_S(n, r, 3.6e14) + xref = exact_solve(A, b) + err_qr = relerr(qr(A) \ b, xref) + err_wood = relerr(factorize(A; strategy = :woodbury, auto_fallback = false, refine = 2) \ b, xref) + @test err_qr < 1.0e-13 + @test err_wood > 1.0e3 * err_qr +end + +@testset "strategy handling and no Woodbury-QR footgun" begin + A = rand_sparsedense(40, 2; seed = 2) + @test qr(A; strategy = :auto) isa SparseWithDenseRowColQRAugmented + @test qr(A; strategy = :augmented) isa SparseWithDenseRowColQRAugmented + @test_throws ArgumentError qr(A; strategy = :woodbury) + @test_throws ArgumentError qr(A; strategy = :bogus) + # The Woodbury-over-qr(S) mode is intentionally not shipped (shares the κ(S)·κ(C) + # cancellation): there must be no such type to silently re-introduce it. + @test !isdefined(SparseWithDenseRowColMatrices, :SparseWithDenseRowColQR) +end + +@testset "eltypes: BLAS floats, complex, and BigFloat all work" begin + # The column-pivoted backend is pure Julia, so the QR path is no longer BLAS-float-only: + # Float32/ComplexF32 work on every Julia version, and generic floats (BigFloat) work too. + for (T, tol) in ( + (Float32, 1.0e-4), (Float64, 1.0e-11), (ComplexF32, 1.0e-4), + (ComplexF64, 1.0e-11), (BigFloat, 1.0e-30), + ) + A = rand_sparsedense(50, 3; seed = 4, T = T) + b = T <: Complex ? (randn(50) .+ im .* randn(50)) .|> T : T.(randn(50)) + F = qr(A) + @test F isa SparseWithDenseRowColQRAugmented{T} + @test relerr(F \ b, exact_solve(A, b)) < tol + end + # Non-float eltypes that cannot support a QR (no sqrt/division) still error. + nn = 20 + Si = spdiagm(0 => fill(4, nn), 1 => fill(-1, nn - 1), -1 => fill(-1, nn - 1)) + Ui = zeros(Int, nn, 2); Ui[1, 1] = 1; Ui[2, 2] = 1 + Vi = zeros(Int, 2, nn); Vi[1, 1] = 1; Vi[2, 2] = 1 + @test_throws Exception qr(SparseWithDenseRowColMatrix(Si, Ui, Vi)) +end + +@testset "adjoint & transpose solve (lazy cached QaugH)" begin + A = rand_sparsedense(120, 3; seed = 11) + M = Matrix(A) + b = randn(120) + F = qr(A) + @test F.QaugH === nothing # not built until first adjoint/transpose solve + @test relerr(F' \ b, M' \ b) < 1.0e-11 + @test F.QaugH !== nothing # lazily built (a CSRQRFactorization) + cached = F.QaugH + @test relerr(F' \ b, M' \ b) < 1.0e-11 + @test F.QaugH === cached # second adjoint solve reuses it (no rebuild) + @test relerr(transpose(F) \ b, transpose(M) \ b) < 1.0e-11 + x = copy(b) + ldiv!(F', x) + @test relerr(x, M' \ b) < 1.0e-11 + # complex A: adjoint ≠ transpose + Ac = rand_sparsedense(80, 2; seed = 12, T = ComplexF64) + Mc = Matrix(Ac) + bc = randn(ComplexF64, 80) + Fc = qr(Ac) + @test relerr(Fc' \ bc, Mc' \ bc) < 1.0e-10 + @test relerr(transpose(Fc) \ bc, transpose(Mc) \ bc) < 1.0e-10 +end + +@testset "complex RHS over a real factorization" begin + A = rand_sparsedense(60, 2; seed = 13) + M = Matrix(A) + b = randn(60) .+ im .* randn(60) + F = qr(A) + @test relerr(F \ b, M \ b) < 1.0e-11 + @test relerr(F' \ b, M' \ b) < 1.0e-11 +end + +@testset "matrix RHS and 3-arg ldiv!" begin + A = rand_sparsedense(90, 3; seed = 14) + M = Matrix(A) + B = randn(90, 5) + F = qr(A) + @test relerr(F \ B, M \ B) < 1.0e-11 + Y = similar(B) + ldiv!(Y, F, B) + @test relerr(Y, M \ B) < 1.0e-11 +end + +@testset "refactor! / qr! (reuses symbolic analysis)" begin + A1 = rand_sparsedense(100, 3; seed = 21) + A2 = rand_sparsedense(100, 3; seed = 22) # same pattern, new values + b = randn(100) + F = qr(A1) + _ = F' \ b # force QaugH to be built + @test F.QaugH !== nothing # lazily built (a CSRQRFactorization) + refactor!(F, A2) + @test F.QaugH === nothing # adjoint cache invalidated on refactor + @test relerr(F \ b, Matrix(A2) \ b) < 1.0e-11 + # qr! is an alias for refactor! + A3 = rand_sparsedense(100, 3; seed = 23) + qr!(F, A3) + @test relerr(F \ b, Matrix(A3) \ b) < 1.0e-11 + # a shape change must be rejected + Abig = rand_sparsedense(101, 3; seed = 24) + @test_throws DimensionMismatch refactor!(F, Abig) + Arank = rand_sparsedense(100, 4; seed = 25) + @test_throws DimensionMismatch refactor!(F, Arank) +end + +@testset "singularity: singular A throws, ill-conditioned S succeeds" begin + # A genuinely singular A (a zero column the dense fill does not supply) ⇒ the bordered + # system is rank-deficient ⇒ SingularException, matching the LU path's contract. + n, r = 40, 2 + S = spdiagm(0 => fill(3.0, n), -1 => fill(-1.0, n - 1)) + S[:, n] .= 0.0 + S[n, :] .= 0.0 + dropzeros!(S) + F = zeros(r, n) + for i in 1:r + F[i, i] = 1.0 # dense rows do not touch column n ⇒ A column n is all zero + end + Asing = SparseWithDenseRowColMatrix(S, F; replace = true) + @test_throws SingularException qr(Asing) + @test_throws SingularException factorize(Asing; strategy = :augmented) # LU agrees + + # Ill-conditioned S but nonsingular A must SUCCEED (the whole point of the path). + Aok = illcond_S(80, 3, 3.6e13) + Fok = qr(Aok) + @test issuccess(Fok) + b = randn(80) + @test relerr(Fok \ b, exact_solve(Aok, b)) < 1.0e-13 +end + +@testset "ill-conditioned ASSEMBLED A does not spuriously throw" begin + # Regression: a rank-revealing QR must not flag an ill-conditioned-but-nonsingular bordered + # system as rank-deficient. An assembled A up to cond ~1e13 must factor and solve to roughly + # its conditioning-limited accuracy (κ(A)·eps), matching the LU path's behavior. + function illcond_assembled(n; kappa) + d = collect(exp.(range(0, -log(kappa); length = n))) # graded 1 .. 1/kappa + S = spdiagm(0 => d) + U = zeros(n, 1); U[1, 1] = 1.0e-8 + V = zeros(1, n); V[1, 1] = 1.0e-8 # negligible low-rank part + return SparseWithDenseRowColMatrix(S, U, V) + end + b = randn(MersenneTwister(0), 120) + for kappa in (1.0e10, 1.0e12, 1.0e13) + A = illcond_assembled(120; kappa = kappa) + F = qr(A) # must NOT throw + @test issuccess(F) + xref = Float64.(big.(Matrix(A)) \ big.(b)) + # backward-stable: forward error tracks κ(A)·eps, not the noise floor + @test relerr(F \ b, xref) < 1.0e3 * cond(Matrix(A)) * eps() + # the augmented-LU path also solves these (contract parity — neither throws) + @test issuccess(factorize(A; strategy = :augmented)) + end +end + +@testset "update_lowrank! is not supported on the QR path" begin + A = rand_sparsedense(30, 2; seed = 31) + F = qr(A) + @test_throws ArgumentError update_lowrank!(F) +end + +@testset "solve is allocation-free" begin + A = rand_sparsedense(200, 4; seed = 41) + b = randn(200) + F = qr(A) + x = copy(b); ldiv!(F, x) # warm + y = copy(b) + @test (@allocated ldiv!(F, y)) == 0 # in-place solve allocates nothing +end + +@testset "r == 0 edge case reduces to qr(S)" begin + n = 50 + S = spdiagm(0 => fill(4.0, n), 1 => fill(-1.0, n - 1), -1 => fill(-1.0, n - 1)) + A = SparseWithDenseRowColMatrix(S, zeros(0, n)) # no dense rows/cols + @test denserank(A) == 0 + F = qr(A) + @test denserank(F) == 0 + @test issuccess(F) + b = randn(n) + @test relerr(F \ b, Matrix(S) \ b) < 1.0e-11 + @test relerr(F' \ b, Matrix(S)' \ b) < 1.0e-11 +end