SciML · ChrisRackauckas · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -29,10 +29,10 @@ jobs:
         with:
           version: ${{ matrix.version }}
       - uses: julia-actions/cache@v2
-      # PureKLU is not yet registered in General; develop it from the SciML repo so the
-      # dependency resolves on every Julia version (`[sources]` in Project.toml covers 1.11+,
-      # but this keeps 1.10 green too). Remove once PureKLU is registered.
-      - name: Develop unregistered PureKLU dependency
-        run: julia --color=yes --project=. -e 'using Pkg; Pkg.develop(url="https://github.com/SciML/PureKLU.jl")'
+      # PureKLU and SparseColumnPivotedQR are not yet registered in General; develop them from
+      # the SciML repos so the dependencies resolve on every Julia version. Remove each once it
+      # is registered.
+      - name: Develop unregistered SciML dependencies
+        run: julia --color=yes --project=. -e 'using Pkg; Pkg.develop([PackageSpec(url="https://github.com/SciML/PureKLU.jl"), PackageSpec(url="https://github.com/SciML/SparseColumnPivotedQR.jl")])'
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
diff --git a/Project.toml b/Project.toml
@@ -8,29 +8,37 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 PureKLU = "0c0d3e7f-3a8b-4f7e-b6f1-9a4d2e7c1f01"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+SparseColumnPivotedQR = "a57abbd0-fea5-4d57-96be-5e525945e8e4"
+SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
 
 [weakdeps]
+IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 
 [extensions]
+SparseWithDenseRowColMatricesIterativeSolversExt = "IterativeSolvers"
 SparseWithDenseRowColMatricesLinearSolveExt = "LinearSolve"
 
 [compat]
 Aqua = "0.8"
 ForwardDiff = "0.10, 1"
+IterativeSolvers = "0.9"
 LinearAlgebra = "1"
 LinearSolve = "3"
 PrecompileTools = "1"
 PureKLU = "0.1"
 Random = "1"
 SafeTestsets = "0.1"
 SparseArrays = "1"
+SparseColumnPivotedQR = "0.1"
+SparseMatricesCSR = "0.6"
 Test = "1"
 julia = "1.10"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -39,4 +47,4 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Aqua", "ForwardDiff", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"]
+test = ["Aqua", "ForwardDiff", "IterativeSolvers", "LinearAlgebra", "LinearSolve", "Random", "SafeTestsets", "SparseArrays", "Test"]
diff --git a/README.md b/README.md
@@ -106,6 +106,131 @@ The `refactor!` + `\` loop is allocation-free apart from the tiny `O(r)` dense p
 * **`:auto`** (default) — Woodbury, automatically falling back to the augmented system if
   `S` is singular or `C` is ill-conditioned.
 
+### QR (the numerically stable path)
+
+`qr(A; strategy=:augmented)` factors the bordered system `[S U; V -I]` with
+[**SparseColumnPivotedQR**](https://github.com/SciML/SparseColumnPivotedQR.jl) (a pure-Julia,
+rank-revealing, column-pivoted sparse QR — the QR analogue of PureKLU), returning a
+`SparseWithDenseRowColQRAugmented`. Like the augmented LU path it never forms `S⁻¹`, so it stays
+accurate when the sparse part `S` is ill-conditioned or nearly singular but the dense rows
+regularize `A` — the
+[FastAlmostBandedMatrices](https://github.com/SciML/FastAlmostBandedMatrices.jl) regime, where
+QR is the conventional choice. Measured forward error on `A = S + U V` with `κ(A) ≈ 6` as
+`κ(S)` sweeps from `3.6e6` to `3.6e14` (reference: BigFloat dense solve):
+
+```
+  κ(S)          qr (augmented)   factorize :woodbury, refine=2   factorize :augmented (LU)
+  3.6e6   …   3.1e-16 (flat)        8.7e-17  →  4.5e-7              1.2e-16 (flat)
+  3.6e14
+```
+
+The Woodbury path's error grows with `κ(S)·κ(C)`; here (where the *assembled* `A` stays
+well-conditioned, `κ(A) ≈ 6`) both augmented paths hold at the noise floor.
+
+**Be clear about what QR does and does not buy you.** Augmented QR is *not* more accurate than
+the augmented **LU** path. It is backward-stable, so its forward error tracks `κ(A)·eps`; when
+the *assembled* `A` is itself ill-conditioned the augmented-LU path's partial pivoting can be
+**more** accurate on these structured matrices (measured: LU near the noise floor where QR sits
+at `κ(A)·eps`, orders apart for `κ(A) ≳ 1e6`). QR's genuine, distinct properties versus
+augmented-LU are that it is **rank-revealing** (a clean numerical-rank/singularity signal) and
+that it is the QR-based interface users of almost-banded solvers expect — not higher accuracy,
+which the LU path already delivers, and not throughput (it is slower; see the trade-offs
+below). Reach for `qr` when you specifically want a QR factorization or its rank diagnostics;
+otherwise `factorize`/`lu` is the better default.
+
+There is **no** `strategy=:woodbury` for `qr`: a Woodbury-over-`qr(S)` solve shares the same
+`κ(S)·κ(C)` cancellation and is catastrophically inaccurate on ill-conditioned `S` (~`1e-1`),
+so it is deliberately not offered. Because the backend is genuinely rank-revealing (column
+pivoting), `qr` throws `SingularException` exactly when the bordered system is rank-deficient;
+a merely ill-conditioned, accurately-solvable `A` is **not** rejected.
+
+The backend gives the QR path the same engineering niceties as the LU path: the solve is
+**allocation-free**, [`refactor!`](@ref)/`qr!` **reuses the symbolic analysis** (column ordering
+/ elimination tree) for a fixed sparsity pattern — so `qr` is usable in a Newton / time-stepping
+hot loop, not only for one-off stable solves — and **any element type** works (the BLAS floats
+plus generic numbers such as `BigFloat`/`ForwardDiff.Dual`, on every Julia version). Cost
+(banded interior + `r` dense rows, factor / solve / refactor, ms):
+
+```
+  n=2000 r=4                          n=5000 r=8
+  qr  (augmented QR) : 89 / 2.8 / 60    620 / 15 / 450    ← rank-revealing; alloc-free solve
+  lu  (augmented LU) : 54 / 1.7 / 18    610 / 11 / 180    ← klu! symbolic reuse on refactor
+  lu  (Woodbury)     : 1.3/ 0.1 / n/a   3.6 / 0.3 / n/a   ← fast path (forms S⁻¹), ~50–60×
+```
+
+QR is now in the same ballpark as the augmented-LU path (a bit slower on factor/refactor;
+allocation-free solve), though still far from the Woodbury fast path. Forward error is
+comparable (~`1e-13`) for all three on a well-conditioned `A`. Reach for `qr` when you want a
+genuinely rank-revealing factorization, generic-eltype support, or the QR interface; `factorize`
+/`lu` (Woodbury) remains the throughput default.
+
+## Rank-deficient / inconsistent least squares (`lstsq`)
+
+`\`, `factorize`/`lu`, and `qr` solve **nonsingular** systems and throw `SingularException`
+when `A` is singular. For a genuinely rank-deficient or inconsistent system, `lstsq(A, b)`
+returns the **minimum-norm least-squares solution** `x = A⁺b` (the Moore–Penrose solution: of
+all `x` minimizing `‖Ax − b‖₂`, the one with smallest `‖x‖₂`).
+
+```julia
+x = lstsq(A, b)                        # :auto — structured direct, dense fallback (default)
+x = lstsq(A, b; alg = :structured)     # structure-exploiting direct solve (never densifies A)
+x = lstsq(A, b; alg = :dense)          # exact dense complete-orthogonal decomposition (gelsy)
+x = lstsq(A, b; alg = :iterative)      # LSQR/LSMR (needs `using IterativeSolvers`)
+x = lstsq(A, b; λ = 1e-3)              # Tikhonov-damped (ridge) variant
+```
+
+This is a **separate, opt-in** entry point — `\`/`factorize`/`qr` stay exact and keep throwing
+on singular `A`; least squares never happens implicitly. The bordered/Woodbury machinery
+*cannot* produce `A⁺b`: solving `[S U; V -I][x;y]=[b;0]` in least squares minimizes a different
+objective than `‖Ax−b‖` (measured 3–65× too large a residual on rank-deficient inconsistent
+systems), and there is no Woodbury-style formula for the pseudoinverse of a sum.
+
+Engines (`:auto` picks the first that applies):
+
+* **`:structured`** — a **direct** solve that **never densifies `A`**. When `S` is nonsingular,
+  `A = S(I + ZV)` with `Z = S⁻¹U`, and the *entire* rank deficiency collapses into the small
+  `r × r` capacitance `C = I + V Z` (`nullity(A) = nullity(C)`, since `det A = det S · det C`).
+  So `A⁺b` is assembled from **one sparse factorization of `S`** (PureKLU) + `r` solves (`Z`) +
+  small dense SVD/QR on `r × r` and `n × s` (`s ≤ 2r`) blocks — `O(factor(S) + n·r²)`, **>100×
+  faster than `:dense`** for large `n` (measured 191× at `n = 2000`, `r = 6`). Handles
+  consistent *and* inconsistent `b` (a rank-`k` projection of `b` onto `range(A)`). A **singular
+  `S` with coordinate-aligned null structure** — the boundary-condition / `replace=true` case,
+  where the zeroed rows are supplied by a `SelectorMatrix` `U` — is handled by a sparse *peel*
+  `S̃ = S + U Uᴴ` (just `r` diagonal entries, so `S̃` stays sparse and is nonsingular) with
+  `A = S̃ + U(V − Uᴴ)`, keeping the direct structure-exploiting path. It errors only when `S` is
+  near-singular or singular with a *general* (dense) null space — where inverting `S` would be
+  silently wrong — so `:auto` falls back to `:dense` there.
+* **`:dense`** — complete-orthogonal decomposition (LAPACK `gelsy`) of the densified `A`. Exact
+  (matches `pinv(Matrix(A))*b` to machine precision), `O(n³)` / `O(n²)`. The mandatory fallback
+  when `S` is near-singular, or singular with a null space that is *not* coordinate-aligned (so
+  the sparse peel does not apply and there is no structure-exploiting direct route — the
+  null-space correction would itself be dense).
+* **`:iterative`** — LSQR (default) / LSMR driven by `A`'s structured matvec/adjoint, never
+  forming `A` — for `n` too large to densify *and* `S` singular (so neither `:structured` nor
+  `:dense` apply). Started from `x0 = 0` it converges to the same minimum-norm solution;
+  approximate (to `atol`/`btol`), fragile under ill-conditioning, and warns on non-convergence.
+  Lives in an extension (`using IterativeSolvers`).
+
+`:auto` (the default) uses `:structured` when `S` is nonsingular (or singular with a
+coordinate-aligned null space) and well-conditioned, otherwise the exact `:dense` COD — so you
+get the fast structure-exploiting direct solve when it is safe and the exact answer always. Only `Float32`/`Float64`/`ComplexF32`/`ComplexF64` are
+supported. Adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`) is computed by the same structured
+kernel and is useful on its own.
+
+For **repeated** least-squares solves with the same `A` (or a Newton / time-stepping loop),
+build the factorization once and reuse it — the sparse factorization of `S` and the small dense
+decompositions are cached, so each solve is a cheap back-solve:
+
+```julia
+F = SparseWithDenseRowColLeastSquares(A)   # structured setup once
+x1 = F \ b1;  ldiv!(x2, F, b2);  …          # each solve reuses S's factorization
+```
+
+Measured (`n = 2000`, `r = 6`): a reused solve is `0.05 ms` / `16 KB` versus the one-shot
+`lstsq(A, b)` at `2.1 ms` / `2.6 MB` — **~44× faster**, since it skips re-factoring `S`. (The
+structured `alg=:auto`/`:structured` path applies; for a general singular `S` the constructor
+errors and you use `lstsq(A, b; alg = :dense)`.)
+
 ## Benchmarks
 
 A banded sparse interior plus `r = 8` dense rows (`n = 5000`). Factoring only `S` and
@@ -138,11 +263,17 @@ globally-scattered `S` degrades every sparse method.)
 * [x] Matrix–vector / matrix–matrix multiply (`mul!` vector path is allocation-free)
 * [x] Factorization and linear solve (`factorize`/`lu`, `\`, `ldiv!`)
 * [x] In-place refactorization with a fixed sparsity pattern (`refactor!`)
-* [x] Adjoint / transpose solves
-* [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex)
-
-`qr` is intentionally not provided — a general sparse `QR` causes far more fill than the
-LU-based KLU path; use `factorize`/`lu`.
+* [x] Adjoint / transpose solves *and* structured adjoint/transpose matvec (`A'*u`, `Aᴴ`, `Aᵀ`)
+* [x] Generic element types (`BigFloat`, `ForwardDiff.Dual`, complex) — LU/Woodbury *and* QR
+* [x] Rank-revealing QR (`qr`) for ill-conditioned `S`: allocation-free solve, symbolic-reuse
+  `refactor!`, any element type (column-pivoted, pure-Julia backend)
+* [x] Minimum-norm least squares (`lstsq`) for rank-deficient / inconsistent `A` — a
+  structure-exploiting **direct** solve (capacitance method) when `S` is nonsingular, >100×
+  faster than densifying; exact dense COD fallback otherwise
+
+`qr(A)` builds the augmented sparse-QR factorization (above): a rank-revealing column-pivoted
+solve with an allocation-free `ldiv!` and a symbolic-reuse `refactor!`. It is comparable to the
+augmented-LU path in cost; `factorize`/`lu` (Woodbury) is still the throughput default.
 
 ## When is this the right tool?
 
@@ -248,17 +379,27 @@ not throw), again matching KLU. One caveat: once a singular `S` forces the augme
 the cache keeps reusing it even if later `S` values are nonsingular — `init` a fresh cache to
 get back the faster Woodbury path.
 
+For the stable QR path, pass `SparseWithDenseRowColQRFactorization(; check_pattern)` instead.
+It has no `strategy`/`refine` (only the augmented mode exists and augmented QR needs no
+refinement); a value update reuses the QR symbolic analysis (`refactor!`) just like the LU
+algorithm, and a singular `A` likewise returns `ReturnCode.Infeasible`. It is opt-in — the
+default algorithm stays the LU one.
+
 ## Public API
 
 ```
 SparseWithDenseRowColMatrix    SelectorMatrix
 sparsepart  fillpart  lowrankfactors  exclusive_sparsepart  denserank
-factorize   lu   \\   ldiv!   refactor!   lu!   update_lowrank!
-SparseWithDenseRowColWoodbury  SparseWithDenseRowColAugmented
+factorize   lu   qr   \\   ldiv!   refactor!   lu!   qr!   update_lowrank!
+lstsq   SparseWithDenseRowColLeastSquares   # minimum-norm least squares (one-shot; cached/reusable)
+SparseWithDenseRowColWoodbury  SparseWithDenseRowColAugmented  SparseWithDenseRowColQRAugmented
 recommend_lowrank_peel   PeelRecommendation
-SparseWithDenseRowColFactorization   # LinearSolve.jl algorithm (extension)
+SparseWithDenseRowColFactorization     # LinearSolve.jl algorithm (LU/Woodbury, default)
+SparseWithDenseRowColQRFactorization   # LinearSolve.jl algorithm (stable QR, opt-in)
 ```
 
+The `:iterative` engine of `lstsq` requires `using IterativeSolvers` (a package extension).
+
 ## Installation
 
 PureKLU.jl is not yet in the General registry; install it from the SciML repository:

diff --git a/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl b/ext/SparseWithDenseRowColMatricesIterativeSolversExt.jl
@@ -0,0 +1,53 @@
+module SparseWithDenseRowColMatricesIterativeSolversExt
+
+using SparseWithDenseRowColMatrices
+using SparseWithDenseRowColMatrices: SparseWithDenseRowColMatrix, _check_lstsq_eltype
+using IterativeSolvers
+using LinearAlgebra: LinearAlgebra
+
+# Iterative engine for `lstsq(A, b; alg=:iterative)`. LSQR (default) / LSMR are driven entirely
+# by A's structured matvec and adjoint (`mul!(y, A, x)` / `mul!(y, A', x)` defined in
+# src/matvec.jl), so the dense `A = S + U·V` is never formed. Started from x0 = 0, both
+# converge to the MINIMUM-NORM least-squares solution for rank-deficient / inconsistent systems
+# (the min-norm guarantee is contingent on the zero start — the solver is never given a guess).
+function SparseWithDenseRowColMatrices._lstsq_iterative(
+        A::SparseWithDenseRowColMatrix, b::AbstractVector;
+        solver::Symbol = :lsqr, atol::Real = 1.0e-12, btol::Real = 1.0e-12,
+        maxiters::Integer = 20 * size(A, 1), λ::Real = 0, verbose::Bool = false,
+        # accept and ignore the dense-only keyword so `alg` can be flipped without edits
+        rcond = nothing,
+    )
+    n = size(A, 1)
+    length(b) == n || throw(DimensionMismatch("A has $n columns, b has length $(length(b))"))
+    _check_lstsq_eltype(promote_type(eltype(A), eltype(b)))   # reject non-BLAS-float up front
+    λ ≥ 0 || throw(ArgumentError("λ (Tikhonov damping) must be ≥ 0; got $λ"))
+
+    x, ch = if solver === :lsqr
+        IterativeSolvers.lsqr(A, b; damp = λ, atol = atol, btol = btol, maxiter = maxiters, log = true)
+    elseif solver === :lsmr
+        IterativeSolvers.lsmr(A, b; λ = λ, atol = atol, btol = btol, maxiter = maxiters, log = true)
+    else
+        throw(ArgumentError("solver must be :lsqr or :lsmr; got :$solver"))
+    end
+
+    # IterativeSolvers' `isconverged` is unreliable here (lsqr reports success even when it
+    # stops at the iteration cap with a wrong answer), so flag on budget exhaustion — which
+    # cleanly separates converged from non-converged for both lsqr and lsmr — and report the
+    # true least-squares optimality ‖Aᴴ(Ax−b)‖ (one cheap structured adjoint matvec; →0 at a
+    # minimizer) rather than trusting the solver's own flag.
+    capped = ch.iters ≥ maxiters
+    verbose && @info "lstsq(alg=:iterative)" solver converged = !capped iters = ch.iters
+    if capped
+        aresid = LinearAlgebra.norm(A' * (A * x .- b))
+        @warn(
+            "lstsq(...; alg=:iterative, solver=:$solver) hit the $maxiters-iteration budget " *
+                "without converging — the result is NOT a reliable least-squares solution. " *
+                "Raise `maxiters`, or use the exact `alg=:dense` if `A` is small enough to densify.",
+            least_squares_residual = aresid,
+            iters = ch.iters,
+        )
+    end
+    return x
+end
+
+end # module