From 8c8762c7764ed8eb932f1433a93546e661aee0b3 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Sat, 30 May 2026 19:40:08 -0400
Subject: [PATCH 1/3] =?UTF-8?q?Add=20lstsq:=20minimum-norm=20least=20squar?=
 =?UTF-8?q?es=20(A=E2=81=BAb)=20for=20rank-deficient=20/=20inconsistent=20?=
 =?UTF-8?q?systems?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`lstsq(A, b)` returns the Moore–Penrose minimum-norm least-squares solution, correct
for rank-deficient or inconsistent A where `\`/`factorize`/`qr` throw SingularException.
A separate, opt-in entry point — the direct solves stay exact and keep throwing.

Why not the bordered/Woodbury path: solving [S U; V -I][x;y]=[b;0] in least squares
minimizes a different objective than ‖Ax−b‖ (measured residual 3–65× too large on
rank-deficient inconsistent systems), and there is no Woodbury formula for the
pseudoinverse of a sum. So this works on A directly.

Two engines:
- :dense (default) — LAPACK gelsy (complete-orthogonal decomposition) of the densified
  A. Exact (matches pinv(Matrix(A))*b to machine precision), O(n³)/O(n²). A = S+UV is
  always fully dense, so a direct solve cannot exploit the sparsity.
- :iterative — LSQR (default) / LSMR driven by A's structured matvec and adjoint, never
  forming the dense A; for large n. Converges to the same min-norm solution from x0=0.
  Lives in an IterativeSolvers extension (weakdep).

Also adds a structured adjoint/transpose matvec to core (Aᴴu = Sᴴu + Vᴴ(Uᴴu)) — a real
fix, since A'*u previously fell back to a ~1000× slower generic getindex path. This lets
the iterative engine drive A directly (no LinearMaps/LinearOperators dependency).

Engine choice: IterativeSolvers, not Krylov/LinearSolve's KrylovJL_LSMR. Measured: with a
correct adjoint, IterativeSolvers lsqr/lsmr reach ~1e-12 min-norm accuracy while Krylov
lsmr/lsqr halt early at ~1e-6 (and tightening tolerances does not help). Both converge —
Krylov is not "broken" — but IterativeSolvers is ~6 orders tighter, which matters here.

Adversarial review found and fixed: (1) lsqr's `isconverged` is unreliable (reports success
at the iteration cap with a wrong answer) — now flag on budget exhaustion and report the
true LS residual; (2) negative λ silently treated as |λ| by the dense path — now rejected;
(3) Int/Rational slipped the eltype guard (checked the float-promoted type) — now checks
the raw promoted type. Regression tests for each.

Stacked on #4 (the QR PR): test_lstsq.jl asserts qr(A) throws SingularException while lstsq
succeeds, so it depends on the QR entry point.

390/390 tests pass on Julia 1.10, 1.11, and 1.12. Runic-formatted, typos clean.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
---
 Project.toml                                  |   6 +-
 README.md                                     |  42 ++++-
 ...hDenseRowColMatricesIterativeSolversExt.jl |  53 +++++++
 src/SparseWithDenseRowColMatrices.jl          |   3 +-
 src/lstsq.jl                                  | 100 ++++++++++++
 src/matvec.jl                                 |  54 +++++++
 test/runtests.jl                              |   3 +
 test/test_lstsq.jl                            | 149 ++++++++++++++++++
 8 files changed, 407 insertions(+), 3 deletions(-)
 create mode 100644 ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl
 create mode 100644 src/lstsq.jl
 create mode 100644 test/test_lstsq.jl

diff --git a/Project.toml b/Project.toml
index f86302c..ffaa145 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,14 +10,17 @@ PureKLU = "0c0d3e7f-3a8b-4f7e-b6f1-9a4d2e7c1f01"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [weakdeps]
+IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 
 [extensions]
+SparseWithDenseRowColMatricesIterativeSolversExt = "IterativeSolvers"
 SparseWithDenseRowColMatricesLinearSolveExt = "LinearSolve"
 
 [compat]
 Aqua = "0.8"
 ForwardDiff = "0.10, 1"
+IterativeSolvers = "0.9"
 LinearAlgebra = "1"
 LinearSolve = "3"
 PrecompileTools = "1"
@@ -31,6 +34,7 @@ julia = "1.10"
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -39,4 +43,4 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Aqua", "ForwardDiff", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"]
+test = ["Aqua", "ForwardDiff", "IterativeSolvers", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"]
diff --git a/README.md b/README.md
index 9f249be..7db9c3c 100644
--- a/README.md
+++ b/README.md
@@ -162,6 +162,42 @@ refactor since it has no symbolic reuse) and far slower than the Woodbury fast p
 error is comparable (~`1e-13`) for all three on a well-conditioned `A`. Use `qr` for the
 factorization/rank-revealing semantics, not for speed.
 
+## Rank-deficient / inconsistent least squares (`lstsq`)
+
+`\`, `factorize`/`lu`, and `qr` solve **nonsingular** systems and throw `SingularException`
+when `A` is singular. For a genuinely rank-deficient or inconsistent system, `lstsq(A, b)`
+returns the **minimum-norm least-squares solution** `x = A⁺b` (the Moore–Penrose solution: of
+all `x` minimizing `‖Ax − b‖₂`, the one with smallest `‖x‖₂`).
+
+```julia
+x = lstsq(A, b)                        # exact, dense COD (default)
+x = lstsq(A, b; alg = :iterative)      # structure-exploiting LSQR/LSMR (needs `using IterativeSolvers`)
+x = lstsq(A, b; λ = 1e-3)              # Tikhonov-damped (ridge) variant
+```
+
+This is a **separate, opt-in** entry point — `\`/`factorize`/`qr` stay exact and keep throwing
+on singular `A`; least squares never happens implicitly. The bordered/Woodbury machinery
+*cannot* produce `A⁺b`: solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a different
+objective than `‖Ax−b‖` (measured 3–65× too large a residual on rank-deficient inconsistent
+systems), and there is no Woodbury-style formula for the pseudoinverse of a sum.
+
+Two engines:
+
+* **`:dense`** (default) — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified
+  `A`. Exact (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` time / `O(n²)` memory.
+  Because `A = S + U·V` is **always fully dense** (the rank-`r` outer product fills every
+  entry), a direct solve genuinely cannot exploit the sparsity — so this is right up to `n` of
+  a few thousand.
+* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint
+  (`A*x = S*x + U(Vx)`, `Aᴴu = Sᴴu + Vᴴ(Uᴴu)`), so the dense `A` is **never formed** — for `n`
+  too large to densify. Started from `x0 = 0` it converges to the same minimum-norm solution.
+  It is approximate (to `atol`/`btol`) and fragile under ill-conditioning; if it exhausts its
+  `maxiters` budget it warns and reports the achieved least-squares residual. Lives in a package
+  extension (`using IterativeSolvers`).
+
+Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. Adjoint/transpose matvec
+(`A'*u`, `Aᴴ`, `Aᵀ`) is also useful on its own and is computed by the same structured kernel.
+
 ## Benchmarks
 
 A banded sparse interior plus `r = 8` dense rows (`n = 5000`). Factoring only `S` and
@@ -194,9 +230,10 @@ globally-scattered `S` degrades every sparse method.)
 * [x] Matrix–vector / matrix–matrix multiply (`mul!` vector path is allocation-free)
 * [x] Factorization and linear solve (`factorize`/`lu`, `\`, `ldiv!`)
 * [x] In-place refactorization with a fixed sparsity pattern (`refactor!`)
-* [x] Adjoint / transpose solves
+* [x] Adjoint / transpose solves *and* structured adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`)
 * [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) — LU/Woodbury path
 * [x] Rank-revealing QR (`qr`) for ill-conditioned `S` (`Float64`/`ComplexF64` only)
+* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A`
 
 `qr(A)` builds the augmented sparse-QR factorization (above). It is the stability path, not
 the throughput path: no symbolic-reuse refactor, no zero-allocation solve, and `Float64`
@@ -317,12 +354,15 @@ augmented mode exists, and augmented QR needs no refinement); singular `A` likew
 SparseWithDenseRowColMatrix    SelectorMatrix
 sparsepart  fillpart  lowrankfactors  exclusive_sparsepart  denserank
 factorize   lu   qr   \\   ldiv!   refactor!   lu!   qr!   update_lowrank!
+lstsq                                  # minimum-norm least squares (rank-deficient / inconsistent)
 SparseWithDenseRowColWoodbury  SparseWithDenseRowColAugmented  SparseWithDenseRowColQRAugmented
 recommend_lowrank_peel   PeelRecommendation
 SparseWithDenseRowColFactorization     # LinearSolve.jl algorithm (LU/Woodbury, default)
 SparseWithDenseRowColQRFactorization   # LinearSolve.jl algorithm (stable QR, opt-in)
 ```
 
+The `:iterative` engine of `lstsq` requires `using IterativeSolvers` (a package extension).
+
 ## Installation
 
 PureKLU.jl is not yet in the General registry; install it from the SciML repository:
diff --git a/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl
new file mode 100644
index 0000000..6441f10
--- /dev/null
+++ b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl
@@ -0,0 +1,53 @@
+module SparseWithDenseRowColMatricesIterativeSolversExt
+
+using SparseWithDenseRowColMatrices
+using SparseWithDenseRowColMatrices: SparseWithDenseRowColMatrix, _check_lstsq_eltype
+using IterativeSolvers
+using LinearAlgebra: LinearAlgebra
+
+# Iterative engine for `lstsq(A, b; alg=:iterative)`. LSQR (default) / LSMR are driven entirely
+# by A's structured matvec and adjoint (`mul!(y, A, x)` / `mul!(y, A', x)` defined in
+# src/matvec.jl), so the dense `A = S + U·V` is never formed. Started from x0 = 0, both
+# converge to the MINIMUM-NORM least-squares solution for rank-deficient / inconsistent systems
+# (the min-norm guarantee is contingent on the zero start — the solver is never given a guess).
+function SparseWithDenseRowColMatrices._lstsq_iterative(
+        A::SparseWithDenseRowColMatrix, b::AbstractVector;
+        solver::Symbol = :lsqr, atol::Real = 1.0e-12, btol::Real = 1.0e-12,
+        maxiters::Integer = 20 * size(A, 1), λ::Real = 0, verbose::Bool = false,
+        # accept and ignore the dense-only keyword so `alg` can be flipped without edits
+        rcond = nothing,
+    )
+    n = size(A, 1)
+    length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))"))
+    _check_lstsq_eltype(promote_type(eltype(A), eltype(b)))   # reject non-BLAS-float up front
+    λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ"))
+
+    x, ch = if solver === :lsqr
+        IterativeSolvers.lsqr(A, b; damp = λ, atol = atol, btol = btol, maxiter = maxiters, log = true)
+    elseif solver === :lsmr
+        IterativeSolvers.lsmr(A, b; λ = λ, atol = atol, btol = btol, maxiter = maxiters, log = true)
+    else
+        throw(ArgumentError("solver must be :lsqr or :lsmr; got :$solver"))
+    end
+
+    # IterativeSolvers' `isconverged` is unreliable here (lsqr reports success even when it
+    # stops at the iteration cap with a wrong answer), so flag on budget exhaustion — which
+    # cleanly separates converged from non-converged for both lsqr and lsmr — and report the
+    # true least-squares optimality ‖Aᴴ(Ax−b)‖ (one cheap structured adjoint matvec; →0 at a
+    # minimizer) rather than trusting the solver's own flag.
+    capped = ch.iters ≥ maxiters
+    verbose && @info "lstsq(alg=:iterative)" solver converged = !capped iters = ch.iters
+    if capped
+        aresid = LinearAlgebra.norm(A' * (A * x .- b))
+        @warn(
+            "lstsq(...; alg=:iterative, solver=:$solver) hit the $maxiters-iteration budget " *
+                "without converging — the result is NOT a reliable least-squares solution. " *
+                "Raise `maxiters`, or use the exact `alg=:dense` if `A` is small enough to densify.",
+            least_squares_residual = aresid,
+            iters = ch.iters,
+        )
+    end
+    return x
+end
+
+end # module
diff --git a/src/SparseWithDenseRowColMatrices.jl b/src/SparseWithDenseRowColMatrices.jl
index 911c33b..2aa2dfc 100644
--- a/src/SparseWithDenseRowColMatrices.jl
+++ b/src/SparseWithDenseRowColMatrices.jl
@@ -17,12 +17,13 @@ include("woodbury.jl")
 include("augmented.jl")
 include("qr.jl")
 include("factorize.jl")
+include("lstsq.jl")
 include("detect.jl")
 
 export SparseWithDenseRowColMatrix, SelectorMatrix,
     sparsepart, fillpart, lowrankfactors, exclusive_sparsepart, denserank,
     SparseWithDenseRowColWoodbury, SparseWithDenseRowColAugmented,
-    SparseWithDenseRowColQRAugmented, refactor!, update_lowrank!,
+    SparseWithDenseRowColQRAugmented, refactor!, update_lowrank!, lstsq,
     recommend_lowrank_peel, PeelRecommendation,
     SparseWithDenseRowColFactorization, SparseWithDenseRowColQRFactorization
 
diff --git a/src/lstsq.jl b/src/lstsq.jl
new file mode 100644
index 0000000..7b8aade
--- /dev/null
+++ b/src/lstsq.jl
@@ -0,0 +1,100 @@
+# ------------------
+# Rank-deficient / inconsistent least squares: minimum-norm solution x = A⁺b
+# ------------------
+#
+# `\`, `factorize`/`lu`, and `qr` solve NONSINGULAR systems and throw on a singular `A`. For a
+# genuinely rank-deficient or inconsistent system one wants the Moore–Penrose solution
+# x = A⁺b (the minimum-2-norm vector among the minimizers of ‖Ax−b‖). The bordered/Woodbury
+# machinery CANNOT produce it — solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a
+# different objective than `‖Ax−b‖`, and there is no Woodbury-style formula for the
+# pseudoinverse of a sum. So this is a separate, opt-in entry point.
+#
+# Two engines:
+#   :dense (default)     — complete-orthogonal decomposition (LAPACK gelsy) of the densified A.
+#                          Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory. Since
+#                          A = S + U·V is always fully dense (the rank-r outer product fills
+#                          every entry), there is no sparsity to exploit in a direct solve.
+#   :iterative           — LSQR/LSMR (IterativeSolvers) driven by A's structured matvec/adjoint,
+#                          never forming the dense A. For n too large to densify. Approximate to
+#                          a tolerance, fragile under ill-conditioning. Lives in an extension
+#                          (`using IterativeSolvers`).
+
+_lstsq_supported(::Type{<:Union{Float32, Float64, ComplexF32, ComplexF64}}) = true
+_lstsq_supported(::Type) = false
+
+function _check_lstsq_eltype(::Type{T}) where {T}
+    _lstsq_supported(T) || throw(
+        ArgumentError(
+            "lstsq supports only Float32/Float64/ComplexF32/ComplexF64 (it uses LAPACK / a BLAS-float " *
+                "iterative solver); got eltype $T."
+        )
+    )
+    return nothing
+end
+
+# Iterative engine — method added by the IterativeSolvers extension; no method in core.
+function _lstsq_iterative end
+
+"""
+    lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:dense, kwargs...) -> x
+
+Minimum-norm least-squares solution `x ≈ A⁺b` (the Moore–Penrose pseudoinverse solution),
+correct for **rank-deficient and/or inconsistent** systems where `\\`/`factorize`/`qr` would
+throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`.
+
+* `alg = :dense` (default): complete-orthogonal decomposition (LAPACK `gelsy`) of the densified
+  `A`. Exact, `O(n³)` time and `O(n²)` memory. `A = S + U·V` is always fully dense, so a direct
+  solve cannot exploit the sparsity; this is the right choice up to `n` of a few thousand.
+* `alg = :iterative`: LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint,
+  never forming the dense `A` — for `n` too large to densify. Requires `using IterativeSolvers`.
+  Approximate to a tolerance and fragile under ill-conditioning; see the keyword options.
+
+Keywords: `rcond` (`:dense` rank cutoff), `solver` (`:lsqr`/`:lsmr`), `atol`, `btol`,
+`maxiters`, `λ` (Tikhonov damping; `0` = pure minimum-norm), `verbose` (`:iterative`).
+Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported.
+
+`\\`, `factorize`/`lu`, and `qr` are unchanged — they stay exact and throw on singular `A`;
+least squares is always an explicit opt-in via `lstsq`.
+"""
+function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :dense, kwargs...)
+    if alg === :dense
+        return _lstsq_dense(A, b; kwargs...)
+    elseif alg === :iterative
+        isempty(methods(_lstsq_iterative)) && throw(
+            ArgumentError(
+                "lstsq(...; alg=:iterative) requires `using IterativeSolvers` (the iterative engine " *
+                    "lives in a package extension)."
+            )
+        )
+        return _lstsq_iterative(A, b; kwargs...)
+    else
+        throw(ArgumentError("alg must be :dense or :iterative; got :$alg"))
+    end
+end
+
+function _lstsq_dense(
+        A::SparseWithDenseRowColMatrix, b::AbstractVector;
+        rcond::Real = -one(real(float(eltype(A)))), λ::Real = 0,
+        # accept and ignore iterative-only keywords so `alg` can be flipped without edits
+        solver = nothing, atol = nothing, btol = nothing, maxiters = nothing, verbose = nothing,
+    )
+    n = size(A, 1)
+    length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))"))
+    λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ"))
+    _check_lstsq_eltype(promote_type(eltype(A), eltype(b)))   # reject non-BLAS-float (e.g. Int) up front
+    TT = float(promote_type(eltype(A), eltype(b)))
+    rc = rcond < 0 ? eps(real(TT)) * n : real(TT)(rcond)
+    M = Matrix(A)
+    Md = eltype(M) === TT ? copy(M) : TT.(M)
+    if iszero(λ)
+        rhs = collect(TT, b)
+        x, _ = LinearAlgebra.LAPACK.gelsy!(Md, rhs, rc)
+        return x
+    else
+        # Tikhonov: minimize ‖A x − b‖² + λ²‖x‖² = ‖[A; λI] x − [b; 0]‖²
+        stacked = vcat(Md, real(TT)(λ) * Matrix{TT}(LinearAlgebra.I, n, n))
+        rhs = vcat(collect(TT, b), zeros(TT, n))
+        x, _ = LinearAlgebra.LAPACK.gelsy!(stacked, rhs, rc)
+        return x[1:n]
+    end
+end
diff --git a/src/matvec.jl b/src/matvec.jl
index 9359f9a..b38a0fe 100644
--- a/src/matvec.jl
+++ b/src/matvec.jl
@@ -115,3 +115,57 @@ function Base.:*(A::SparseWithDenseRowColMatrix, X::AbstractMatrix)
     Y = Matrix{T}(undef, size(A, 1), size(X, 2))
     return mul!(Y, A, X, one(T), zero(T))
 end
+
+# ------------------
+# Adjoint / transpose matvec:  Aᴴu = Sᴴu + Vᴴ(Uᴴu)   (and Aᵀ with no conjugation)
+# ------------------
+# Without these, `mul!(y, A', u)` / `A'*u` fall back to LinearAlgebra's generic `getindex`
+# path, which materializes columns of `A` and is ~1000× slower than the structured product —
+# crippling for an iterative least-squares solve, whose every step needs an adjoint matvec.
+
+# w = Uᴴu (length r). Selector U = [I_r; 0] ⇒ (Uᴴu)_k = u_k.
+function _lowrank_adjU!(w, U::AbstractMatrix, u, cnj)
+    @inbounds for k in axes(U, 2)
+        s = zero(eltype(w))
+        for i in axes(U, 1)
+            s += cnj(U[i, k]) * u[i]
+        end
+        w[k] = s
+    end
+    return w
+end
+function _lowrank_adjU!(w, U::SelectorMatrix, u, cnj)
+    @inbounds for k in 1:U.r
+        w[k] = u[k]
+    end
+    return w
+end
+
+# `Sop` is `adjoint`/`transpose`, `cnj` is `conj`/`identity` (for the dense U/V blocks).
+function _adjoint_matvec!(
+        y::AbstractVector, A::SparseWithDenseRowColMatrix, u::AbstractVector,
+        α::Number, β::Number, Sop, cnj
+    )
+    mul!(y, Sop(A.S), u, α, β)                 # y .= α Sᴴu + β y
+    U, V = A.U, A.V
+    r = size(V, 1)
+    w = Vector{promote_type(eltype(A), eltype(u))}(undef, r)
+    _lowrank_adjU!(w, U, u, cnj)               # w = Uᴴu  (length r)
+    @inbounds for k in 1:r
+        wk = α * w[k]
+        for j in axes(V, 2)
+            y[j] += cnj(V[k, j]) * wk           # y .+= α Vᴴw
+        end
+    end
+    return y
+end
+
+for (Wrap, Sop, cnj) in ((:Adjoint, :adjoint, :conj), (:Transpose, :transpose, :identity))
+    @eval LinearAlgebra.mul!(
+        y::AbstractVector, wA::$Wrap{<:Any, <:SparseWithDenseRowColMatrix}, u::AbstractVector,
+        α::Number, β::Number,
+    ) = _adjoint_matvec!(y, parent(wA), u, α, β, $Sop, $cnj)
+    @eval LinearAlgebra.mul!(
+        y::AbstractVector, wA::$Wrap{<:Any, <:SparseWithDenseRowColMatrix}, u::AbstractVector,
+    ) = mul!(y, wA, u, true, false)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 78949fb..e2abce2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -16,6 +16,9 @@ using SafeTestsets, Test
     @safetestset "QR factorization (augmented, stable)" begin
         include("test_qr.jl")
     end
+    @safetestset "Rank-deficient least squares (min-norm A⁺b)" begin
+        include("test_lstsq.jl")
+    end
     @safetestset "Refactorization (Newton path)" begin
         include("test_refactor.jl")
     end
diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl
new file mode 100644
index 0000000..5e1e0b8
--- /dev/null
+++ b/test/test_lstsq.jl
@@ -0,0 +1,149 @@
+include("testutils.jl")
+using Test, LinearAlgebra, SparseArrays
+import IterativeSolvers
+
+# A = S + U V with EXACTLY rank n-1: zero column/row n of S and zero column n of V make column
+# n of A identically zero, so A is genuinely rank-deficient while keeping the S+UV structure.
+function rankdef(n, r; seed = 1, T = Float64, selector = false)
+    rng = MersenneTwister(seed)
+    S = spdiagm(0 => fill(T(4), n), 1 => fill(T(-1), n - 1), -1 => fill(T(-1), n - 1))
+    S[:, n] .= zero(T); S[n, :] .= zero(T); dropzeros!(S)
+    U = selector ? SelectorMatrix{T}(n, r) : T.(randn(rng, n, r))
+    V = T.(randn(rng, r, n)); V[:, n] .= zero(T)
+    return SparseWithDenseRowColMatrix(S, U, V)
+end
+
+minnorm(M, b) = pinv(M) * b      # the oracle: minimum-norm least-squares solution A⁺b
+
+@testset "adjoint / transpose matvec (core, structured Sᴴu + Vᴴ(Uᴴu))" begin
+    for (lbl, A) in (
+            ("dense-U", rankdef(70, 3; seed = 1)),
+            ("selector-U", rankdef(70, 3; seed = 2, selector = true)),
+        )
+        M = Matrix(A); u = randn(70)
+        @test A' * u ≈ M' * u
+        @test transpose(A) * u ≈ transpose(M) * u
+        @test mul!(similar(u), A', u) ≈ M' * u
+        y = randn(70); ref = 2.0 .* (M' * u) .+ 3.0 .* y   # 5-arg in-place
+        mul!(y, A', u, 2.0, 3.0)
+        @test y ≈ ref
+    end
+    # complex: adjoint (conjugating) ≠ transpose
+    rng = MersenneTwister(9); n, r = 60, 2
+    S = spdiagm(0 => fill(ComplexF64(4), n), 1 => fill(ComplexF64(-1), n - 1), -1 => fill(ComplexF64(-1), n - 1))
+    Ac = SparseWithDenseRowColMatrix(S, randn(rng, ComplexF64, n, r), randn(rng, ComplexF64, r, n))
+    Mc = Matrix(Ac); u = randn(ComplexF64, n)
+    @test Ac' * u ≈ Mc' * u
+    @test transpose(Ac) * u ≈ transpose(Mc) * u
+    @test !(Ac' * u ≈ transpose(Ac) * u)        # genuinely different for complex
+end
+
+@testset "dense engine: minimum-norm LS on rank-deficient A (vs pinv oracle)" begin
+    for seed in 1:4
+        A = rankdef(80, 3; seed = seed)
+        M = Matrix(A)
+        @test rank(M) == 79                      # genuinely rank-deficient
+        N = nullspace(M)                         # n × 1 here
+        for (kind, b) in (
+                (:consistent, M * randn(80)),
+                (:inconsistent, randn(80)),
+            )
+            xref = minnorm(M, b)
+            x = lstsq(A, b)                       # alg = :dense default
+            @test relerr(x, xref) < 1.0e-10
+            @test norm(x) ≈ norm(xref) rtol = 1.0e-9          # minimum-norm
+            @test norm(M' * (M * x - b)) < 1.0e-8 * norm(M)   # least-squares optimality
+            @test norm(N' * x) < 1.0e-9                       # no null-space component
+            kind === :consistent && @test norm(M * x - b) < 1.0e-9
+        end
+    end
+end
+
+@testset "iterative engine recovers the same minimum-norm solution" begin
+    for seed in 1:3, solver in (:lsqr, :lsmr)
+        A = rankdef(80, 3; seed = seed)
+        M = Matrix(A)
+        b = seed == 1 ? M * randn(80) : randn(80)
+        xref = minnorm(M, b)
+        x = lstsq(A, b; alg = :iterative, solver = solver, atol = 1.0e-13, btol = 1.0e-13)
+        @test relerr(x, xref) < 1.0e-6
+        @test norm(x) ≈ norm(xref) rtol = 1.0e-5
+        @test relerr(x, lstsq(A, b)) < 1.0e-6     # agrees with the dense engine
+    end
+end
+
+@testset "SelectorMatrix-U (boundary-condition form)" begin
+    A = rankdef(80, 2; seed = 5, selector = true)
+    M = Matrix(A); b = randn(80)
+    xref = minnorm(M, b)
+    @test relerr(lstsq(A, b), xref) < 1.0e-10
+    @test relerr(lstsq(A, b; alg = :iterative), xref) < 1.0e-6
+end
+
+@testset "full-rank square reduces to the ordinary solve" begin
+    A = rand_sparsedense(60, 3; seed = 11)
+    M = Matrix(A); b = randn(60)
+    @test relerr(lstsq(A, b), M \ b) < 1.0e-10
+    @test relerr(lstsq(A, b; alg = :iterative), M \ b) < 1.0e-6
+end
+
+@testset "complex rank-deficient LS" begin
+    rng = MersenneTwister(21); n, r = 70, 2
+    S = spdiagm(0 => fill(ComplexF64(4), n), -1 => fill(ComplexF64(-1), n - 1))
+    S[:, n] .= 0; S[n, :] .= 0; dropzeros!(S)
+    V = randn(rng, ComplexF64, r, n); V[:, n] .= 0
+    A = SparseWithDenseRowColMatrix(S, randn(rng, ComplexF64, n, r), V)
+    M = Matrix(A); b = randn(ComplexF64, n)
+    @test rank(M) == n - 1
+    @test relerr(lstsq(A, b), minnorm(M, b)) < 1.0e-10
+    @test relerr(lstsq(A, b; alg = :iterative), minnorm(M, b)) < 1.0e-6
+end
+
+@testset "Tikhonov damping λ > 0 (both engines)" begin
+    A = rankdef(60, 2; seed = 7)
+    M = Matrix(A); b = randn(60); λ = 0.1
+    xref = (M' * M + λ^2 * I) \ (M' * b)         # the damped normal-equations solution
+    @test relerr(lstsq(A, b; λ = λ), xref) < 1.0e-9
+    @test relerr(lstsq(A, b; alg = :iterative, λ = λ, atol = 1.0e-13, btol = 1.0e-13), xref) < 1.0e-6
+end
+
+@testset "lstsq is a separate opt-in: \\ / factorize / qr still throw on singular A" begin
+    A = rankdef(50, 2; seed = 3)
+    b = randn(50)
+    @test_throws SingularException factorize(A; strategy = :augmented)
+    @test_throws SingularException qr(A)
+    @test_throws SingularException (A \ b)
+    @test relerr(lstsq(A, b), minnorm(Matrix(A), b)) < 1.0e-10   # but lstsq succeeds
+end
+
+@testset "eltype guard and argument validation" begin
+    A = rankdef(20, 2; seed = 1, T = BigFloat)   # gelsy / BLAS-iterative cannot do BigFloat
+    @test_throws ArgumentError lstsq(A, randn(BigFloat, 20))
+    # non-BLAS-float (Int) must be rejected, not silently promoted to a Float64 answer
+    Si = spdiagm(0 => fill(4, 20), 1 => fill(-1, 19), -1 => fill(-1, 19))
+    Ui = zeros(Int, 20, 2); Ui[1, 1] = 1; Ui[2, 2] = 1
+    Vi = zeros(Int, 2, 20); Vi[1, 1] = 1; Vi[2, 2] = 1
+    Ai = SparseWithDenseRowColMatrix(Si, Ui, Vi)
+    @test_throws ArgumentError lstsq(Ai, ones(Int, 20))
+    @test_throws ArgumentError lstsq(Ai, ones(Int, 20); alg = :iterative)
+
+    Af = rankdef(20, 2; seed = 1)
+    bf = randn(20)
+    @test_throws ArgumentError lstsq(Af, bf; alg = :bogus)
+    @test_throws ArgumentError lstsq(Af, bf; alg = :iterative, solver = :bogus)
+    @test_throws DimensionMismatch lstsq(Af, randn(19))
+    # negative Tikhonov damping rejected by BOTH engines (dense must not silently use |λ|)
+    @test_throws ArgumentError lstsq(Af, bf; λ = -1.0)
+    @test_throws ArgumentError lstsq(Af, bf; alg = :iterative, λ = -1.0)
+end
+
+@testset "iterative non-convergence warns (isconverged is unreliable for lsqr)" begin
+    A = rankdef(60, 3; seed = 4)
+    b = randn(60)
+    # a far-too-small budget must WARN for both solvers (the bug was lsqr silently returning a
+    # wrong answer with isconverged=true; we now flag on budget exhaustion)
+    @test_logs (:warn,) match_mode = :any lstsq(A, b; alg = :iterative, solver = :lsqr, maxiters = 2)
+    @test_logs (:warn,) match_mode = :any lstsq(A, b; alg = :iterative, solver = :lsmr, maxiters = 2)
+    # an ample budget converges, so the result is accurate (and the warn branch is not taken)
+    @test relerr(lstsq(A, b; alg = :iterative, maxiters = 20 * 60), minnorm(Matrix(A), b)) < 1.0e-6
+end

From 699afb6a460ef14cf5bbfe4e17b80b341d262a5d Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Sun, 31 May 2026 07:54:06 -0400
Subject: [PATCH 2/3] lstsq: add a structure-exploiting DIRECT engine
 (capacitance) + make :auto the default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `alg=:structured` and makes `alg=:auto` the default for `lstsq`. When S is nonsingular,
A = S(I + ZV) with Z = S⁻¹U, and the entire rank deficiency collapses into the small r×r
capacitance C = I + V Z (nullity(A) = nullity(C), since det A = det S · det C). So A⁺b is
assembled from ONE sparse factorization of S (PureKLU) + r solves for Z + small dense SVD/QR
on r×r and n×s (s ≤ 2r) blocks — A is NEVER formed or factored densely. O(factor(S) + n·r²);
measured 191× faster than the dense COD at n=2000, r=6. Handles consistent AND inconsistent b
(a rank-k projection of b onto range(A) via the left-null basis S⁻ᴴVᴴnull(Cᴴ)).

:auto uses :structured when S is nonsingular & well-conditioned, else the exact :dense COD.
The structured route inverts S, so it is guarded two ways and falls back on failure: (1) klu
throws on exactly-singular S; (2) near-singular S is caught by a κ̂(S)=‖S‖₁‖Z‖/‖U‖ estimate
(klu does NOT throw there — it silently returns garbage Z, verified) plus a post-hoc
least-squares-optimality check. `alg=:structured` (forced) errors rather than return a wrong
answer; `alg=:auto` falls back to :dense.

Every formula was derived and verified against the pinv(Matrix(A))*b oracle before
implementation (consistent/inconsistent × rank deficiency k=1,2,3 × dense-U/SelectorMatrix-U ×
complex): relerr ~1e-15, min-norm + LS-optimality + null-space-free certificates. The
inconsistent-case projection is essential — the naive M⁺S⁻¹b shortcut is 23% wrong on
inconsistent b (the LS objective ‖Ax−b‖=‖S(Mx−S⁻¹b)‖ is S-weighted).

581/581 tests pass on Julia 1.10, 1.11, and 1.12. Runic-formatted, typos clean. No new deps
(reuses the existing PureKLU dependency for the S factorization).

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
---
 README.md          |  51 +++++++++-----
 src/lstsq.jl       | 168 +++++++++++++++++++++++++++++++++++++++------
 test/test_lstsq.jl |  79 ++++++++++++++++++++-
 3 files changed, 256 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index 7db9c3c..d6ec5f5 100644
--- a/README.md
+++ b/README.md
@@ -170,8 +170,10 @@ returns the **minimum-norm least-squares solution** `x = A⁺b` (the Moore–Pen
 all `x` minimizing `‖Ax − b‖₂`, the one with smallest `‖x‖₂`).
 
 ```julia
-x = lstsq(A, b)                        # exact, dense COD (default)
-x = lstsq(A, b; alg = :iterative)      # structure-exploiting LSQR/LSMR (needs `using IterativeSolvers`)
+x = lstsq(A, b)                        # :auto — structured direct, dense fallback (default)
+x = lstsq(A, b; alg = :structured)     # structure-exploiting direct solve (never densifies A)
+x = lstsq(A, b; alg = :dense)          # exact dense complete-orthogonal decomposition (gelsy)
+x = lstsq(A, b; alg = :iterative)      # LSQR/LSMR (needs `using IterativeSolvers`)
 x = lstsq(A, b; λ = 1e-3)              # Tikhonov-damped (ridge) variant
 ```
 
@@ -181,22 +183,31 @@ on singular `A`; least squares never happens implicitly. The bordered/Woodbury m
 objective than `‖Ax−b‖` (measured 3–65× too large a residual on rank-deficient inconsistent
 systems), and there is no Woodbury-style formula for the pseudoinverse of a sum.
 
-Two engines:
-
-* **`:dense`** (default) — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified
-  `A`. Exact (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` time / `O(n²)` memory.
-  Because `A = S + U·V` is **always fully dense** (the rank-`r` outer product fills every
-  entry), a direct solve genuinely cannot exploit the sparsity — so this is right up to `n` of
-  a few thousand.
-* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint
-  (`A*x = S*x + U(Vx)`, `Aᴴu = Sᴴu + Vᴴ(Uᴴu)`), so the dense `A` is **never formed** — for `n`
-  too large to densify. Started from `x0 = 0` it converges to the same minimum-norm solution.
-  It is approximate (to `atol`/`btol`) and fragile under ill-conditioning; if it exhausts its
-  `maxiters` budget it warns and reports the achieved least-squares residual. Lives in a package
-  extension (`using IterativeSolvers`).
-
-Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported. Adjoint/transpose matvec
-(`A'*u`, `Aᴴ`, `Aᵀ`) is also useful on its own and is computed by the same structured kernel.
+Engines (`:auto` picks the first that applies):
+
+* **`:structured`** — a **direct** solve that **never densifies `A`**. When `S` is nonsingular,
+  `A = S(I + ZV)` with `Z = S⁻¹U`, and the *entire* rank deficiency collapses into the small
+  `r × r` capacitance `C = I + V Z` (`nullity(A) = nullity(C)`, since `det A = det S · det C`).
+  So `A⁺b` is assembled from **one sparse factorization of `S`** (PureKLU) + `r` solves (`Z`) +
+  small dense SVD/QR on `r × r` and `n × s` (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, **>100×
+  faster than `:dense`** for large `n` (measured 191× at `n = 2000`, `r = 6`). Handles
+  consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). Requires `S`
+  nonsingular and well-conditioned — it errors otherwise (where inverting `S` would be silently
+  wrong), so `:auto` falls back to `:dense` there.
+* **`:dense`** — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. Exact
+  (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` / `O(n²)`. The mandatory fallback
+  when `S` is singular/near-singular (the structured route inverts `S`, so it can't apply there).
+* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec/adjoint, never
+  forming `A` — for `n` too large to densify *and* `S` singular (so neither `:structured` nor
+  `:dense` apply). Started from `x0 = 0` it converges to the same minimum-norm solution;
+  approximate (to `atol`/`btol`), fragile under ill-conditioning, and warns on non-convergence.
+  Lives in an extension (`using IterativeSolvers`).
+
+`:auto` (the default) uses `:structured` when `S` is nonsingular and well-conditioned,
+otherwise the exact `:dense` COD — so you get the fast structure-exploiting direct solve when it
+is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are
+supported. Adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) is computed by the same structured
+kernel and is useful on its own.
 
 ## Benchmarks
 
@@ -233,7 +244,9 @@ globally-scattered `S` degrades every sparse method.)
 * [x] Adjoint / transpose solves *and* structured adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`)
 * [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) — LU/Woodbury path
 * [x] Rank-revealing QR (`qr`) for ill-conditioned `S` (`Float64`/`ComplexF64` only)
-* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A`
+* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A` — a
+  structure-exploiting **direct** solve (capacitance method) when `S` is nonsingular, >100×
+  faster than densifying; exact dense COD fallback otherwise
 
 `qr(A)` builds the augmented sparse-QR factorization (above). It is the stability path, not
 the throughput path: no symbolic-reuse refactor, no zero-allocation solve, and `Float64`
diff --git a/src/lstsq.jl b/src/lstsq.jl
index 7b8aade..719ae0d 100644
--- a/src/lstsq.jl
+++ b/src/lstsq.jl
@@ -9,15 +9,23 @@
 # different objective than `‖Ax−b‖`, and there is no Woodbury-style formula for the
 # pseudoinverse of a sum. So this is a separate, opt-in entry point.
 #
-# Two engines:
-#   :dense (default)     — complete-orthogonal decomposition (LAPACK gelsy) of the densified A.
-#                          Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory. Since
-#                          A = S + U·V is always fully dense (the rank-r outer product fills
-#                          every entry), there is no sparsity to exploit in a direct solve.
+# Engines:
+#   :auto (default)      — try :structured, fall back to :dense when S is singular/near-singular.
+#   :structured          — DIRECT and structure-exploiting (no densifying). When S is nonsingular,
+#                          A = S(I + ZV) with Z = S⁻¹U, and the entire rank deficiency collapses
+#                          into the small r×r capacitance C = I + V Z (nullity(A) = nullity(C)).
+#                          So A⁺b is built from ONE sparse factorization of S (PureKLU) + r solves
+#                          (Z) + small dense SVD/QR on r×r and n×s (s ≤ 2r) blocks — never forming
+#                          or factoring the dense A. O(factor(S) + n·r²); >100× faster than dense
+#                          COD for large n. Requires S nonsingular & well-conditioned (guarded).
+#   :dense               — complete-orthogonal decomposition (LAPACK gelsy) of the densified A.
+#                          Exact (matches `pinv(Matrix(A))*b`), O(n³) time / O(n²) memory; the
+#                          mandatory fallback when S is singular/near-singular (where the
+#                          structured route is silently wrong).
 #   :iterative           — LSQR/LSMR (IterativeSolvers) driven by A's structured matvec/adjoint,
-#                          never forming the dense A. For n too large to densify. Approximate to
-#                          a tolerance, fragile under ill-conditioning. Lives in an extension
-#                          (`using IterativeSolvers`).
+#                          never forming the dense A. For n too large to densify when S is also
+#                          singular (so neither :structured nor :dense apply). Approximate to a
+#                          tolerance, fragile under ill-conditioning. Lives in an extension.
 
 _lstsq_supported(::Type{<:Union{Float32, Float64, ComplexF32, ComplexF64}}) = true
 _lstsq_supported(::Type) = false
@@ -36,28 +44,51 @@ end
 function _lstsq_iterative end
 
 """
-    lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:dense, kwargs...) -> x
+    lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg=:auto, kwargs...) -> x
 
 Minimum-norm least-squares solution `x ≈ A⁺b` (the Moore–Penrose pseudoinverse solution),
 correct for **rank-deficient and/or inconsistent** systems where `\\`/`factorize`/`qr` would
-throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`.
+throw. Among all `x` minimizing `‖A x − b‖₂`, returns the one of smallest `‖x‖₂`. Both the
+consistent and inconsistent cases are handled (no user flag).
 
-* `alg = :dense` (default): complete-orthogonal decomposition (LAPACK `gelsy`) of the densified
-  `A`. Exact, `O(n³)` time and `O(n²)` memory. `A = S + U·V` is always fully dense, so a direct
-  solve cannot exploit the sparsity; this is the right choice up to `n` of a few thousand.
-* `alg = :iterative`: LSQR (default) / LSMR driven by `A`'s structured matvec and adjoint,
-  never forming the dense `A` — for `n` too large to densify. Requires `using IterativeSolvers`.
-  Approximate to a tolerance and fragile under ill-conditioning; see the keyword options.
+* `alg = :auto` (default): use the structure-exploiting **direct** method when `S` is
+  nonsingular and well-conditioned, automatically falling back to `:dense` otherwise.
+* `alg = :structured`: a **direct** solve that never densifies `A`. When `S` is nonsingular the
+  rank deficiency collapses into the small `r × r` capacitance `C = I + V S⁻¹U`, so `A⁺b` comes
+  from one sparse factorization of `S` (PureKLU) plus small dense work on `r × r` / `n × s`
+  (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, `>100×` faster than `:dense` for large `n`. Errors
+  if `S` is singular/near-singular (where the structured route would be silently wrong).
+* `alg = :dense`: complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`.
+  Exact, `O(n³)` / `O(n²)`; the mandatory fallback when `S` is singular.
+* `alg = :iterative`: LSQR / LSMR via `A`'s structured matvec/adjoint, never densifying — for
+  `n` too large to densify *and* `S` singular. Requires `using IterativeSolvers`; approximate.
 
-Keywords: `rcond` (`:dense` rank cutoff), `solver` (`:lsqr`/`:lsmr`), `atol`, `btol`,
-`maxiters`, `λ` (Tikhonov damping; `0` = pure minimum-norm), `verbose` (`:iterative`).
-Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported.
+Keywords: `tolC` (`:structured`/`:auto` rank cutoff for `C`), `rcond` (`:dense` rank cutoff),
+`solver` (`:lsqr`/`:lsmr`), `atol`, `btol`, `maxiters`, `λ` (Tikhonov damping; `0` = pure
+minimum-norm), `verbose`. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are supported.
 
 `\\`, `factorize`/`lu`, and `qr` are unchanged — they stay exact and throw on singular `A`;
 least squares is always an explicit opt-in via `lstsq`.
 """
-function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :dense, kwargs...)
-    if alg === :dense
+function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol = :auto, kwargs...)
+    if alg === :auto
+        # structured can't do Tikhonov; route λ>0 (and structured-unsafe S) to the exact dense COD
+        if iszero(get(kwargs, :λ, 0))
+            x, ok = _lstsq_structured(A, b; kwargs...)
+            ok && return x
+        end
+        return _lstsq_dense(A, b; kwargs...)
+    elseif alg === :structured
+        x, ok = _lstsq_structured(A, b; kwargs...)
+        ok || throw(
+            ArgumentError(
+                "lstsq(...; alg=:structured) is unsafe for this A: S is singular or near-singular, " *
+                    "so the structured route (which inverts S) would be silently wrong. Use alg=:dense " *
+                    "(exact) or alg=:auto (auto-fallback)."
+            )
+        )
+        return x
+    elseif alg === :dense
         return _lstsq_dense(A, b; kwargs...)
     elseif alg === :iterative
         isempty(methods(_lstsq_iterative)) && throw(
@@ -68,10 +99,103 @@ function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol =
         )
         return _lstsq_iterative(A, b; kwargs...)
     else
-        throw(ArgumentError("alg must be :dense or :iterative; got :$alg"))
+        throw(ArgumentError("alg must be :auto, :structured, :dense or :iterative; got :$alg"))
     end
 end
 
+# Structure-exploiting DIRECT min-norm least-squares (returns `(x, ok)`; `ok=false` signals the
+# caller to fall back to dense because S is singular/near-singular). With S nonsingular,
+# A = S(I + ZV), Z = S⁻¹U; the rank deficiency lives in C = I + V Z (nullity(A)=nullity(C)).
+# A⁺b is assembled from one PureKLU factorization of S + small dense SVD/QR — A is never formed.
+function _lstsq_structured(
+        A::SparseWithDenseRowColMatrix, b::AbstractVector;
+        tolC::Real = -one(real(float(eltype(A)))), λ::Real = 0,
+        # accept and ignore the other engines' keywords so `alg` can be flipped without edits
+        rcond = nothing, solver = nothing, atol = nothing, btol = nothing,
+        maxiters = nothing, verbose = nothing,
+    )
+    n = size(A, 1)
+    r = denserank(A)
+    length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))"))
+    λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ"))
+    iszero(λ) || throw(ArgumentError("lstsq(...; alg=:structured) does not support Tikhonov λ>0; use alg=:dense."))
+    RT = promote_type(eltype(A), eltype(b))
+    _check_lstsq_eltype(RT)
+    TT = float(RT)
+    rT = real(TT)
+    tol = tolC < 0 ? sqrt(eps(rT)) : rT(tolC)
+
+    Sown = _own_sparse(TT, A.S)
+    Sfact = try
+        PureKLU.klu(Sown)
+    catch e
+        e isa SingularException || rethrow(e)
+        return (zeros(TT, n), false)                # S exactly singular → fall back to dense
+    end
+
+    if r == 0                                       # A = S nonsingular ⇒ the solution is S⁻¹b
+        x = collect(TT, b)
+        PureKLU.solve!(Sfact, x)
+        return (x, true)
+    end
+
+    V = Matrix{TT}(A.V)                             # r × n
+    Z = Matrix{TT}(undef, n, r)
+    materialize_U!(Z, A.U)                          # Z = U
+    nrmU = norm(Z)
+    PureKLU.solve!(Sfact, Z)                        # Z = S⁻¹U
+
+    # Guard: near-singular S makes Z = S⁻¹U huge/garbage and `klu` does NOT throw, which both
+    # crashes the SVDs and gives a silently-wrong answer. κ̂(S) = ‖S‖₁·‖Z‖/‖U‖ ≳ κ(S); bail early.
+    kappaS = nrmU > 0 ? opnorm(Sown, 1) * norm(Z) / nrmU : zero(rT)
+    (all(isfinite, Z) && isfinite(kappaS) && kappaS ≤ inv(sqrt(eps(rT)))) ||
+        return (zeros(TT, n), false)
+
+    Vadj = Matrix(V')                              # n × r  (adjoint; conjugate for complex)
+    C = V * Z                                       # r × r capacitance
+    @inbounds for i in 1:r
+        C[i, i] += one(TT)
+    end
+    Fc = svd(C)
+    kdef = count(<(tol * Fc.S[1]), Fc.S)            # nullity(A) = nullity(C)
+
+    # orthonormal basis W (n × s, s ≤ 2r) of the active subspace span([Z | Vᴴ]); M is the
+    # identity off this subspace, so Mᴴ-pseudoinverse work happens only on the small s × s block.
+    QB = qr(hcat(Z, Vadj), ColumnNorm())
+    Rd = abs.(diag(QB.R))
+    s = isempty(Rd) ? 0 : count(>(tol * maximum(Rd)), Rd)
+    W = Matrix(QB.Q)[:, 1:s]
+    Msmall = W' * (Z * (V * W))                     # s × s
+    @inbounds for i in 1:s
+        Msmall[i, i] += one(TT)
+    end
+    Fs = svd(Msmall)
+    sp = [σ > tol * Fs.S[1] ? inv(σ) : zero(rT) for σ in Fs.S]   # truncated pinv of Msmall
+
+    btil = collect(TT, b)
+    if kdef > 0                                     # project b onto range(A) (no-op when full rank)
+        Nl = Fc.U[:, (end - kdef + 1):end]          # null(Cᴴ)  (r × kdef)
+        LrA = Vadj * Nl                             # n × kdef
+        PureKLU.solve!(Sfact', LrA)                 # left-null(A) = S⁻ᴴ Vᴴ null(Cᴴ)
+        QL = Matrix(qr(LrA).Q)[:, 1:kdef]
+        btil .-= QL * (QL' * btil)
+    end
+    PureKLU.solve!(Sfact, btil)                     # btil = S⁻¹ b̃  (= c)
+    Wtc = W' * btil
+    x = W * (Fs.V * (sp .* (Fs.U' * Wtc))) + (btil - W * Wtc)   # x = Mᴴ⁺ c
+
+    # post-hoc least-squares optimality check (cheap structured matvecs): catches a misjudged
+    # rank/nullity. ‖Aᴴ(Ax−b)‖ → 0 at a minimizer; a loose threshold separates a good solution
+    # (~1e-14 relative) from a grossly wrong one (~1). A false trip only falls `:auto` back to the
+    # exact dense path, so erring conservative is safe.
+    res = A * x
+    res .-= b
+    atr = A' * res
+    nrm_atr = norm(atr)
+    ok = nrm_atr ≤ 1.0e-6 * (norm(A' * collect(TT, b)) + nrm_atr)
+    return (x, ok)
+end
+
 function _lstsq_dense(
         A::SparseWithDenseRowColMatrix, b::AbstractVector;
         rcond::Real = -one(real(float(eltype(A)))), λ::Real = 0,
diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl
index 5e1e0b8..98b0f0a 100644
--- a/test/test_lstsq.jl
+++ b/test/test_lstsq.jl
@@ -15,6 +15,22 @@ end
 
 minnorm(M, b) = pinv(M) * b      # the oracle: minimum-norm least-squares solution A⁺b
 
+# A = S + U V with S NONSINGULAR and A rank n-k: choose V so the r×r capacitance C = I + V S⁻¹U
+# has nullity k (det A = det S · det C). This is the regime the structured-direct engine targets.
+function rankdef_nonsingS(n, r, k; seed = 1, T = Float64, selector = false)
+    rng = MersenneTwister(seed)
+    S = spdiagm(-1 => fill(T(-1), n - 1), 0 => fill(T(4), n), 1 => fill(T(-1), n - 1))
+    U = selector ? SelectorMatrix{T}(n, r) : T.(randn(rng, n, r))
+    Z = Matrix(S) \ Matrix(U)
+    Q = Matrix(qr(T <: Complex ? randn(rng, T, r, r) : randn(rng, r, r)).Q)
+    d = ones(T, r)
+    for i in 1:k
+        d[i] = zero(T)
+    end
+    V = T.((Q * Diagonal(d) * Q' - I(r)) * pinv(Z))
+    return SparseWithDenseRowColMatrix(S, U, V)
+end
+
 @testset "adjoint / transpose matvec (core, structured Sᴴu + Vᴴ(Uᴴu))" begin
     for (lbl, A) in (
             ("dense-U", rankdef(70, 3; seed = 1)),
@@ -49,7 +65,7 @@ end
                 (:inconsistent, randn(80)),
             )
             xref = minnorm(M, b)
-            x = lstsq(A, b)                       # alg = :dense default
+            x = lstsq(A, b; alg = :dense)
             @test relerr(x, xref) < 1.0e-10
             @test norm(x) ≈ norm(xref) rtol = 1.0e-9          # minimum-norm
             @test norm(M' * (M * x - b)) < 1.0e-8 * norm(M)   # least-squares optimality
@@ -59,6 +75,67 @@ end
     end
 end
 
+@testset "structured direct engine: min-norm LS without densifying (S nonsingular)" begin
+    for seed in 1:3, (n, r, k) in ((60, 3, 1), (90, 5, 2), (120, 8, 3)), sel in (false, true)
+        A = rankdef_nonsingS(n, r, k; seed = seed, selector = sel)
+        M = Matrix(A)
+        @test rank(M) == n - k
+        N = nullspace(M)
+        for (kind, b) in ((:consistent, M * randn(n)), (:inconsistent, randn(n)))
+            xref = minnorm(M, b)
+            x = lstsq(A, b; alg = :structured)
+            @test relerr(x, xref) < 1.0e-9
+            @test norm(x) ≈ norm(xref) rtol = 1.0e-8          # minimum-norm
+            @test norm(M' * (M * x - b)) < 1.0e-7 * norm(M)   # least-squares optimality
+            @test norm(N' * x) < 1.0e-8                       # no null-space component
+            kind === :consistent && @test norm(M * x - b) < 1.0e-8
+        end
+    end
+end
+
+@testset "structured engine: complex" begin
+    n, r, k = 70, 3, 1
+    rng = MersenneTwister(5)
+    S = spdiagm(0 => fill(ComplexF64(4), n), -1 => fill(ComplexF64(-1), n - 1), 1 => fill(ComplexF64(-1), n - 1))
+    U = randn(rng, ComplexF64, n, r); Z = Matrix(S) \ U
+    Q = Matrix(qr(randn(rng, ComplexF64, r, r)).Q); d = ones(r); d[1] = 0
+    V = ComplexF64.((Q * Diagonal(d) * Q' - I(r)) * pinv(Z))
+    A = SparseWithDenseRowColMatrix(S, U, V); M = Matrix(A)
+    b = randn(ComplexF64, n)
+    @test relerr(lstsq(A, b; alg = :structured), minnorm(M, b)) < 1.0e-9
+end
+
+@testset ":auto uses structured on nonsingular S, falls back on singular/near-singular S" begin
+    # nonsingular S: :auto path agrees with explicit :structured and :dense, all = pinv
+    A = rankdef_nonsingS(100, 4, 2; seed = 1)
+    M = Matrix(A); b = randn(100); xref = minnorm(M, b)
+    @test relerr(lstsq(A, b), xref) < 1.0e-9                       # :auto default
+    @test relerr(lstsq(A, b; alg = :auto), xref) < 1.0e-9
+    @test lstsq(A, b; alg = :auto) ≈ lstsq(A, b; alg = :structured)
+    @test lstsq(A, b; alg = :auto) ≈ lstsq(A, b; alg = :dense)
+
+    # exactly-singular S (zeroed row): structured klu throws → :auto falls back to dense
+    Asing = rankdef(80, 3; seed = 2)
+    Msing = Matrix(Asing); bs = randn(80)
+    @test relerr(lstsq(Asing, bs; alg = :auto), minnorm(Msing, bs)) < 1.0e-9
+    @test_throws ArgumentError lstsq(Asing, bs; alg = :structured)
+
+    # near-singular S (κ(S) ≈ 5e13, but A well-conditioned): structured would be silently wrong;
+    # :auto must fall back to the exact dense COD and :structured must error (not return garbage)
+    n = 80; rng = MersenneTwister(3)
+    Snear = spdiagm(0 => collect(exp.(range(0, -log(4.7e13); length = n))), -1 => fill(-1.0e-3, n - 1))
+    Anear = SparseWithDenseRowColMatrix(Snear, randn(rng, n, 2), randn(rng, 2, n))
+    Mnear = Matrix(Anear); bn = randn(n)
+    @test cond(Mnear) < 1.0e8                                      # A itself is well-conditioned
+    @test relerr(lstsq(Anear, bn; alg = :auto), minnorm(Mnear, bn)) < 1.0e-7
+    @test_throws ArgumentError lstsq(Anear, bn; alg = :structured)
+end
+
+@testset "structured rejects Tikhonov λ" begin
+    A = rankdef_nonsingS(40, 2, 1; seed = 1)
+    @test_throws ArgumentError lstsq(A, randn(40); alg = :structured, λ = 0.1)
+end
+
 @testset "iterative engine recovers the same minimum-norm solution" begin
     for seed in 1:3, solver in (:lsqr, :lsmr)
         A = rankdef(80, 3; seed = seed)

From 4abc4fc3347118dbd3fbe5e1474bc19c54764c68 Mon Sep 17 00:00:00 2001
From: ChrisRackauckas-Claude <accounts@chrisrackauckas.com>
Date: Sun, 31 May 2026 08:52:56 -0400
Subject: [PATCH 3/3] lstsq: peel a selector-singular S so the structured
 direct engine handles the BVP case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When S is singular because its boundary rows are zeroed and supplied by a SelectorMatrix U
(the `replace=true` / almost-banded case), the structured engine previously fell back to the
dense COD. Add a sparse "peel": A = S + UV = (S + U Uᴴ) + U(V − Uᴴ), and for a selector U the
term U Uᴴ is just r diagonal entries, so S̃ = S + U Uᴴ stays SPARSE and is typically nonsingular
(the zeroed rows get a unit diagonal back), with the rank unchanged. The capacitance method
then applies to (S̃, U, V − Uᴴ) — a structure-exploiting DIRECT solve, no densifying.

Verified against pinv(Matrix(A))*b: ~2-3e-15 for A nonsingular AND rank-deficient, consistent
AND inconsistent b. A general singular S whose null space is NOT coordinate-aligned has a dense
de-singularizing correction (no sparse peel exists), so it still falls back to the exact dense
COD — a genuine mathematical limit, caught by `_peel_selector` returning nothing.

617/617 tests pass on Julia 1.10, 1.11, and 1.12. Runic-formatted, typos clean.

Co-Authored-By: Chris Rackauckas <accounts@chrisrackauckas.com>
---
 README.md          | 20 +++++++++++------
 src/lstsq.jl       | 54 +++++++++++++++++++++++++++++++++++++++-------
 test/test_lstsq.jl | 26 ++++++++++++++++++++++
 3 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index d6ec5f5..e9c63d8 100644
--- a/README.md
+++ b/README.md
@@ -191,21 +191,27 @@ Engines (`:auto` picks the first that applies):
   So `A⁺b` is assembled from **one sparse factorization of `S`** (PureKLU) + `r` solves (`Z`) +
   small dense SVD/QR on `r × r` and `n × s` (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, **>100×
   faster than `:dense`** for large `n` (measured 191× at `n = 2000`, `r = 6`). Handles
-  consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). Requires `S`
-  nonsingular and well-conditioned — it errors otherwise (where inverting `S` would be silently
-  wrong), so `:auto` falls back to `:dense` there.
+  consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). A **singular
+  `S` with coordinate-aligned null structure** — the boundary-condition / `replace=true` case,
+  where the zeroed rows are supplied by a `SelectorMatrix` `U` — is handled by a sparse *peel*
+  `S̃ = S + U Uᴴ` (just `r` diagonal entries, so `S̃` stays sparse and is nonsingular) with
+  `A = S̃ + U(V − Uᴴ)`, keeping the direct structure-exploiting path. It errors only when `S` is
+  near-singular or singular with a *general* (dense) null space — where inverting `S` would be
+  silently wrong — so `:auto` falls back to `:dense` there.
 * **`:dense`** — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. Exact
   (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` / `O(n²)`. The mandatory fallback
-  when `S` is singular/near-singular (the structured route inverts `S`, so it can't apply there).
+  when `S` is near-singular, or singular with a null space that is *not* coordinate-aligned (so
+  the sparse peel does not apply and there is no structure-exploiting direct route — the
+  null-space correction would itself be dense).
 * **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec/adjoint, never
   forming `A` — for `n` too large to densify *and* `S` singular (so neither `:structured` nor
   `:dense` apply). Started from `x0 = 0` it converges to the same minimum-norm solution;
   approximate (to `atol`/`btol`), fragile under ill-conditioning, and warns on non-convergence.
   Lives in an extension (`using IterativeSolvers`).
 
-`:auto` (the default) uses `:structured` when `S` is nonsingular and well-conditioned,
-otherwise the exact `:dense` COD — so you get the fast structure-exploiting direct solve when it
-is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are
+`:auto` (the default) uses `:structured` when `S` is nonsingular (or singular with a
+coordinate-aligned null space) and well-conditioned, otherwise the exact `:dense` COD — so you
+get the fast structure-exploiting direct solve when it is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are
 supported. Adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) is computed by the same structured
 kernel and is useful on its own.
 
diff --git a/src/lstsq.jl b/src/lstsq.jl
index 719ae0d..61c282c 100644
--- a/src/lstsq.jl
+++ b/src/lstsq.jl
@@ -103,10 +103,51 @@ function lstsq(A::SparseWithDenseRowColMatrix, b::AbstractVector; alg::Symbol =
     end
 end
 
+# Sparse "peel" for a SINGULAR S whose null structure is coordinate-aligned (the BVP /
+# boundary-condition case: `replace=true` zeros the selector rows of S, U = [I_r; 0]). The
+# identity A = S + U V = (S + U Uᴴ) + U(V − Uᴴ) re-splits A with S̃ = S + U Uᴴ; for a selector U,
+# U Uᴴ is just `r` diagonal entries, so S̃ stays SPARSE and is typically nonsingular (the zeroed
+# rows get a unit diagonal back), and the rank is unchanged. The structured method then applies
+# to (S̃, U, V − Uᴴ). For a dense U, U Uᴴ is dense (S̃ dense) so no sparse peel exists — return
+# `nothing` and let the caller fall back to the dense COD.
+_peel_selector(S, U, V) = nothing
+function _peel_selector(S::SparseMatrixCSC{T}, U::SelectorMatrix, V::AbstractMatrix{T}) where {T}
+    r = U.r
+    r == 0 && return nothing
+    n = size(S, 1)
+    Stilde = S + sparse(1:r, 1:r, ones(T, r), n, n)        # S̃ = S + U Uᴴ  (sparse)
+    Vtilde = copy(V)
+    @inbounds for k in 1:r
+        Vtilde[k, k] -= one(T)                              # Ṽ = V − Uᴴ
+    end
+    return (Stilde, Vtilde)
+end
+
+# Factor S; on (exact) singularity try the sparse selector-peel and factor S̃ instead. Returns
+# `(Sfact, S_effective, V_effective)` or `nothing` (caller falls back to the dense COD).
+function _klu_or_peel(Sown::SparseMatrixCSC, U, Vmat)
+    try
+        return (PureKLU.klu(Sown), Sown, Vmat)
+    catch e
+        e isa SingularException || rethrow(e)
+    end
+    peeled = _peel_selector(Sown, U, Vmat)
+    peeled === nothing && return nothing
+    Stilde, Vtilde = peeled
+    try
+        return (PureKLU.klu(Stilde), Stilde, Vtilde)
+    catch e
+        e isa SingularException || rethrow(e)
+        return nothing                                   # S̃ still singular → dense fallback
+    end
+end
+
 # Structure-exploiting DIRECT min-norm least-squares (returns `(x, ok)`; `ok=false` signals the
 # caller to fall back to dense because S is singular/near-singular). With S nonsingular,
 # A = S(I + ZV), Z = S⁻¹U; the rank deficiency lives in C = I + V Z (nullity(A)=nullity(C)).
 # A⁺b is assembled from one PureKLU factorization of S + small dense SVD/QR — A is never formed.
+# A SINGULAR S with coordinate-aligned null structure (selector U) is first peeled to a sparse
+# nonsingular S̃ (see `_peel_selector`); a general singular S falls back to the dense COD.
 function _lstsq_structured(
         A::SparseWithDenseRowColMatrix, b::AbstractVector;
         tolC::Real = -one(real(float(eltype(A)))), λ::Real = 0,
@@ -126,12 +167,10 @@ function _lstsq_structured(
     tol = tolC < 0 ? sqrt(eps(rT)) : rT(tolC)
 
     Sown = _own_sparse(TT, A.S)
-    Sfact = try
-        PureKLU.klu(Sown)
-    catch e
-        e isa SingularException || rethrow(e)
-        return (zeros(TT, n), false)                # S exactly singular → fall back to dense
-    end
+    Vmat = r > 0 ? Matrix{TT}(A.V) : Matrix{TT}(undef, 0, n)
+    fac = _klu_or_peel(Sown, A.U, Vmat)             # factor S, or peel a selector-singular S to S̃
+    fac === nothing && return (zeros(TT, n), false) # S (and any peel) singular → fall back to dense
+    Sfact, Sown, V = fac                            # S, Sown, V are the *effective* (possibly peeled) ones
 
     if r == 0                                       # A = S nonsingular ⇒ the solution is S⁻¹b
         x = collect(TT, b)
@@ -139,11 +178,10 @@ function _lstsq_structured(
         return (x, true)
     end
 
-    V = Matrix{TT}(A.V)                             # r × n
     Z = Matrix{TT}(undef, n, r)
     materialize_U!(Z, A.U)                          # Z = U
     nrmU = norm(Z)
-    PureKLU.solve!(Sfact, Z)                        # Z = S⁻¹U
+    PureKLU.solve!(Sfact, Z)                        # Z = S̃⁻¹U
 
     # Guard: near-singular S makes Z = S⁻¹U huge/garbage and `klu` does NOT throw, which both
     # crashes the SVDs and gives a silently-wrong answer. κ̂(S) = ‖S‖₁·‖Z‖/‖U‖ ≳ κ(S); bail early.
diff --git a/test/test_lstsq.jl b/test/test_lstsq.jl
index 98b0f0a..10312b4 100644
--- a/test/test_lstsq.jl
+++ b/test/test_lstsq.jl
@@ -105,6 +105,32 @@ end
     @test relerr(lstsq(A, b; alg = :structured), minnorm(M, b)) < 1.0e-9
 end
 
+@testset "structured engine peels a selector-singular S (BVP boundary case)" begin
+    # S has its boundary rows zeroed (singular); U is a SelectorMatrix supplying them. The sparse
+    # peel S̃ = S + U Uᴴ (r diagonal entries) keeps the structured DIRECT path — no dense fallback —
+    # and stays exact, for A nonsingular AND rank-deficient, consistent AND inconsistent.
+    for seed in 1:3, rde in (false, true)
+        n, r = 100, 3
+        S = spdiagm(0 => fill(3.0, n), 1 => fill(-1.0, n - 1), -1 => fill(-1.0, n - 1))
+        S[1:r, :] .= 0.0; dropzeros!(S)
+        F = zeros(r, n)
+        for i in 1:r
+            F[i, i] = 1.0; F[i, i + r] = 0.3
+        end
+        rde && (F[r, :] .= F[1, :])                  # make A itself rank-deficient
+        A = SparseWithDenseRowColMatrix(S, F; replace = true)
+        M = Matrix(A); N = nullspace(M)
+        @test rank(M) == n - (rde ? 1 : 0)
+        for (kind, b) in ((:cons, M * randn(n)), (:incons, randn(n)))
+            xref = minnorm(M, b)
+            x = lstsq(A, b; alg = :structured)       # must NOT throw — the peel succeeds
+            @test relerr(x, xref) < 1.0e-9
+            @test norm(x) ≈ norm(xref) rtol = 1.0e-8
+            isempty(N) || @test norm(N' * x) < 1.0e-8
+        end
+    end
+end
+
 @testset ":auto uses structured on nonsingular S, falls back on singular/near-singular S" begin
     # nonsingular S: :auto path agrees with explicit :structured and :dense, all = pinv
     A = rankdef_nonsingS(100, 4, 2; seed = 1)