diff --git a/README.md b/README.md
index 3fa9761..976f8c6 100644
--- a/README.md
+++ b/README.md
@@ -25,15 +25,26 @@ pip install -e .
 ### Usage
 
 ```python
-from alsgls import als_gls, simulate_sur, nll_per_row, XB_from_Blist
+from alsgls import ALSGLS, ALSGLSSystem, simulate_sur
 
 Xs_tr, Y_tr, Xs_te, Y_te = simulate_sur(N_tr=240, N_te=120, K=60, p=3, k=4)
-B, F, D, mem, _ = als_gls(Xs_tr, Y_tr, k=4)
-Yhat_te = XB_from_Blist(Xs_te, B)
-nll = nll_per_row(Y_te - Yhat_te, F, D)
+
+# Scikit-learn style estimator
+est = ALSGLS(rank="auto", max_sweeps=12)
+est.fit(Xs_tr, Y_tr)
+test_score = est.score(Xs_te, Y_te)  # negative test NLL per observation
+
+# Statsmodels-style system interface
+system = {f"eq{j}": (Y_tr[:, j], Xs_tr[j]) for j in range(Y_tr.shape[1])}
+sys_model = ALSGLSSystem(system, rank="auto")
+sys_results = sys_model.fit()
+params = sys_results.params_as_series()  # pandas optional
 ```
 
-See `examples/compare_als_vs_em.py` for a complete ALS versus EM comparison.
+See `examples/compare_als_vs_em.py` for a complete ALS versus EM comparison. The
+`benchmarks/compare_sur.py` script contrasts ALS-GLS with `statsmodels` and
+`linearmodels` SUR implementations on matched simulation grids while recording
+peak memory (via Memray, Fil, or the POSIX RSS high-water mark).
 
 ### Documentation and notebooks
 
@@ -63,3 +74,24 @@ To show the magnitude, we ran a Monte‑Carlo experiment with N = 300 observat
 
 Statistically, the two estimators are indistinguishable (paired‑test p ≥ 0.14).  Computationally, ALS needs only a few megabytes whereas EM needs tens to hundreds.
 
+### Defaults, tuning knobs, and failure modes
+
+- **Rank (`k`)** – By default the high-level APIs pick `min(8, ceil(K / 10))`, a
+  conservative fraction of the number of equations. Increase `rank` if the
+  cross-equation correlation matrix is slow to decay; decrease it when the
+  diagonal dominates.
+- **ALS ridge terms (`lam_F`, `lam_B`)** – Defaults to `1e-3` for both the
+  latent-factor and regression updates; raise them slightly (e.g. `1e-2`) if CG
+  struggles to converge or the NLL trace plateaus early.
+- **Noise floor (`d_floor`)** – Keeps the diagonal component positive; the
+  default `1e-8` protects against breakdowns when an equation is nearly
+  deterministic. Increase it in highly ill-conditioned settings.
+- **Stopping criteria** – ALS stops when the relative drop in NLL per sweep is
+  below `1e-6` (configurable via `rel_tol`) or after `max_sweeps`. Inspect
+  `info["nll_trace"]` to diagnose stagnation.
+- **Possible failures** – Large condition numbers or nearly-collinear regressors
+  can make the β-step CG solve slow; adjust `cg_tol`/`cg_maxit`, add stronger
+  ridge, or re-scale predictors. If `info["accept_t"]` stays at zero and the
+  NLL does not improve, the factor rank may be too large relative to the sample
+  size.
+
diff --git a/alsgls/__init__.py b/alsgls/__init__.py
index a1f15da..6606a64 100644
--- a/alsgls/__init__.py
+++ b/alsgls/__init__.py
@@ -1,7 +1,19 @@
 from .als import als_gls
+from .api import ALSGLS, ALSGLSSystem, ALSGLSSystemResults
 from .em import em_gls
 from .metrics import mse, nll_per_row
-from .sim import simulate_sur, simulate_gls
 from .ops import XB_from_Blist
+from .sim import simulate_gls, simulate_sur
 
-__all__ = ["als_gls", "em_gls", "mse", "nll_per_row", "simulate_sur", "simulate_gls", "XB_from_Blist"]
+__all__ = [
+    "ALSGLS",
+    "ALSGLSSystem",
+    "ALSGLSSystemResults",
+    "XB_from_Blist",
+    "als_gls",
+    "em_gls",
+    "mse",
+    "nll_per_row",
+    "simulate_gls",
+    "simulate_sur",
+]
diff --git a/alsgls/als.py b/alsgls/als.py
index 9e75eee..c0b5d07 100644
--- a/alsgls/als.py
+++ b/alsgls/als.py
@@ -24,6 +24,7 @@ def als_gls(
     *,
     scale_correct: bool = True,
     scale_floor: float = 1e-8,
+    rel_tol: float = 1e-6,
 ):
     """
     Alternating-least-squares GLS with low-rank-plus-diagonal covariance.
@@ -51,6 +52,7 @@ def als_gls(
     cg_tol : float                CG relative tolerance
     scale_correct : bool          if True, try guarded MLE scale fix on Σ each sweep
     scale_floor : float           min scalar for scale correction
+    rel_tol : float               relative NLL improvement threshold for early stopping
 
     Returns
     -------
@@ -81,6 +83,8 @@ def als_gls(
         raise ValueError(f"k must be between 1 and min(K={K}, N={N})")
     if lam_F < 0 or lam_B < 0:
         raise ValueError("Regularization parameters must be non-negative")
+    if rel_tol < 0:
+        raise ValueError("rel_tol must be non-negative")
 
     p_list = [X.shape[1] for X in Xs]
 
@@ -242,7 +246,7 @@ def try_with_scale(F_try, D_try):
         # Convergence: stop if relative improvement w.r.t previous post-Σ NLL is tiny
         rel_impr = (nll_prev - nll_curr) / max(1.0, abs(nll_prev))
         nll_prev = nll_curr
-        if rel_impr < 1e-6:
+        if rel_impr < rel_tol:
             break
 
     # Memory estimate: F (K×k) + D (K) + U (N×k) doubles
diff --git a/alsgls/api.py b/alsgls/api.py
new file mode 100644
index 0000000..6ee4bda
--- /dev/null
+++ b/alsgls/api.py
@@ -0,0 +1,345 @@
+"""High-level estimator APIs for ALS-based GLS fitting."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple
+
+import numpy as np
+
+from .als import als_gls
+from .metrics import nll_per_row
+from .ops import XB_from_Blist
+
+
+def _auto_rank(num_equations: int) -> int:
+    """Heuristic rank used when the user does not provide one."""
+
+    if num_equations <= 0:
+        raise ValueError("num_equations must be positive")
+    # Cap the rank to avoid chasing noise; allow moderate growth with K.
+    return max(1, min(8, int(np.ceil(num_equations / 10))))
+
+
+def _asarray_2d(x: Any, *, dtype: np.dtype = np.float64) -> np.ndarray:
+    """Convert array-like input to a 2D ``numpy.ndarray``."""
+
+    if hasattr(x, "to_numpy"):
+        arr = x.to_numpy()
+    else:
+        arr = np.asarray(x)
+    arr = np.asarray(arr, dtype=dtype)
+    if arr.ndim == 1:
+        arr = arr[:, None]
+    if arr.ndim != 2:
+        raise ValueError("Input must be convertible to a 2D array")
+    return arr
+
+
+def _column_names(obj: Any, size: int) -> List[str]:
+    if hasattr(obj, "columns"):
+        return list(obj.columns)
+    if hasattr(obj, "dtype") and getattr(obj.dtype, "names", None):
+        return list(obj.dtype.names)
+    return [f"x{i}" for i in range(size)]
+
+
+def _eq_name(name: Any, index: int) -> str:
+    return str(name) if name is not None else f"eq{index}"
+
+
+class ALSGLS:
+    """Scikit-learn style estimator for low-rank GLS via ALS."""
+
+    def __init__(
+        self,
+        *,
+        rank: Optional[int | str] = "auto",
+        lam_F: float = 1e-3,
+        lam_B: float = 1e-3,
+        max_sweeps: int = 12,
+        rel_tol: float = 1e-6,
+        d_floor: float = 1e-8,
+        cg_maxit: int = 800,
+        cg_tol: float = 3e-7,
+        scale_correct: bool = True,
+        scale_floor: float = 1e-8,
+    ) -> None:
+        self.rank = rank
+        self.lam_F = lam_F
+        self.lam_B = lam_B
+        self.max_sweeps = max_sweeps
+        self.rel_tol = rel_tol
+        self.d_floor = d_floor
+        self.cg_maxit = cg_maxit
+        self.cg_tol = cg_tol
+        self.scale_correct = scale_correct
+        self.scale_floor = scale_floor
+
+    # ------------------------------------------------------------------
+    # Scikit-learn estimator protocol
+    # ------------------------------------------------------------------
+    def get_params(self, deep: bool = True) -> Dict[str, Any]:  # noqa: D401 - sklearn API
+        return {
+            "rank": self.rank,
+            "lam_F": self.lam_F,
+            "lam_B": self.lam_B,
+            "max_sweeps": self.max_sweeps,
+            "rel_tol": self.rel_tol,
+            "d_floor": self.d_floor,
+            "cg_maxit": self.cg_maxit,
+            "cg_tol": self.cg_tol,
+            "scale_correct": self.scale_correct,
+            "scale_floor": self.scale_floor,
+        }
+
+    def set_params(self, **params: Any) -> "ALSGLS":  # noqa: D401 - sklearn API
+        for key, value in params.items():
+            if not hasattr(self, key):
+                raise ValueError(f"Unknown parameter {key!r}")
+            setattr(self, key, value)
+        return self
+
+    # ------------------------------------------------------------------
+    # Fitting / inference
+    # ------------------------------------------------------------------
+    def fit(self, Xs: Sequence[Any], Y: Any) -> "ALSGLS":
+        X_list = [_asarray_2d(X) for X in Xs]
+        Y_arr = _asarray_2d(Y)
+
+        N, K = Y_arr.shape
+        if len(X_list) != K:
+            raise ValueError(f"Received {len(X_list)} design matrices for {K} equations")
+
+        for j, X in enumerate(X_list):
+            if X.shape[0] != N:
+                raise ValueError(f"X[{j}] has {X.shape[0]} rows but Y has {N}")
+
+        if self.rank == "auto" or self.rank is None:
+            k = _auto_rank(K)
+        else:
+            k = int(self.rank)
+
+        if not (1 <= k <= min(K, N)):
+            raise ValueError(f"rank must be in [1, min(K={K}, N={N})]")
+
+        B_list, F, D, mem_mb, info = als_gls(
+            X_list,
+            Y_arr,
+            k=k,
+            lam_F=self.lam_F,
+            lam_B=self.lam_B,
+            sweeps=self.max_sweeps,
+            d_floor=self.d_floor,
+            cg_maxit=self.cg_maxit,
+            cg_tol=self.cg_tol,
+            scale_correct=self.scale_correct,
+            scale_floor=self.scale_floor,
+            rel_tol=self.rel_tol,
+        )
+
+        self.B_list_ = B_list
+        self.F_ = F
+        self.D_ = D
+        self.mem_mb_est_ = mem_mb
+        self.info_ = info
+        self.n_features_in_ = tuple(X.shape[1] for X in X_list)
+        self.n_targets_ = K
+        self.n_obs_ = N
+        self.rank_ = k
+        self.training_residuals_ = Y_arr - XB_from_Blist(X_list, B_list)
+        self.is_fitted_ = True
+        return self
+
+    def _ensure_fitted(self) -> None:
+        if not getattr(self, "is_fitted_", False):
+            raise RuntimeError("The estimator has not been fitted yet")
+
+    def predict(self, Xs: Sequence[Any]) -> np.ndarray:
+        self._ensure_fitted()
+        X_list = [_asarray_2d(X) for X in Xs]
+        if len(X_list) != len(self.B_list_):
+            raise ValueError("Number of design matrices does not match fitted model")
+        for j, (X, B) in enumerate(zip(X_list, self.B_list_)):
+            if X.shape[1] != B.shape[0]:
+                raise ValueError(f"X[{j}] has {X.shape[1]} columns but expected {B.shape[0]}")
+        return XB_from_Blist(X_list, self.B_list_)
+
+    def score(self, Xs: Sequence[Any], Y: Any) -> float:
+        self._ensure_fitted()
+        Y_arr = _asarray_2d(Y)
+        if Y_arr.shape[1] != self.n_targets_:
+            raise ValueError("Y has incompatible number of targets")
+        preds = self.predict(Xs)
+        if preds.shape != Y_arr.shape:
+            raise ValueError("Predictions and Y have incompatible shapes")
+        residual = Y_arr - preds
+        return -float(nll_per_row(residual, self.F_, self.D_))
+
+
+@dataclass
+class _SystemEquation:
+    name: str
+    y: np.ndarray
+    X: np.ndarray
+    column_names: List[str]
+
+
+class ALSGLSSystem:
+    """Statsmodels-style system container for ALS GLS fitting."""
+
+    def __init__(
+        self,
+        system: Mapping[Any, Tuple[Any, Any]] | Sequence[Tuple[Any, Tuple[Any, Any]]],
+        *,
+        rank: Optional[int | str] = "auto",
+        lam_F: float = 1e-3,
+        lam_B: float = 1e-3,
+        max_sweeps: int = 12,
+        rel_tol: float = 1e-6,
+        d_floor: float = 1e-8,
+        cg_maxit: int = 800,
+        cg_tol: float = 3e-7,
+        scale_correct: bool = True,
+        scale_floor: float = 1e-8,
+    ) -> None:
+        if isinstance(system, Mapping):
+            items = list(system.items())
+        else:
+            items = list(system)
+        if len(items) == 0:
+            raise ValueError("system must contain at least one equation")
+
+        equations: List[_SystemEquation] = []
+        n_obs: Optional[int] = None
+
+        for idx, (name, (y, X)) in enumerate(items):
+            y_arr = _asarray_2d(y)
+            X_arr = _asarray_2d(X)
+            if y_arr.shape[1] != 1:
+                raise ValueError("Each equation's response must be 1D")
+            y_arr = y_arr.reshape(-1, 1)
+            if n_obs is None:
+                n_obs = y_arr.shape[0]
+            elif y_arr.shape[0] != n_obs:
+                raise ValueError("All equations must share the same number of rows")
+            if X_arr.shape[0] != n_obs:
+                raise ValueError("Design matrix rows must match the response length")
+            equations.append(
+                _SystemEquation(
+                    name=_eq_name(name, idx),
+                    y=y_arr,
+                    X=X_arr,
+                    column_names=_column_names(X, X_arr.shape[1]),
+                )
+            )
+
+        self._equations = equations
+        self.rank = rank
+        self.lam_F = lam_F
+        self.lam_B = lam_B
+        self.max_sweeps = max_sweeps
+        self.rel_tol = rel_tol
+        self.d_floor = d_floor
+        self.cg_maxit = cg_maxit
+        self.cg_tol = cg_tol
+        self.scale_correct = scale_correct
+        self.scale_floor = scale_floor
+
+    @property
+    def nobs(self) -> int:
+        return self._equations[0].y.shape[0]
+
+    @property
+    def keqs(self) -> int:
+        return len(self._equations)
+
+    def as_arrays(self) -> Tuple[List[np.ndarray], np.ndarray]:
+        Xs = [eq.X for eq in self._equations]
+        Y = np.column_stack([eq.y for eq in self._equations])
+        return Xs, Y
+
+    def fit(self) -> "ALSGLSSystemResults":
+        estimator = ALSGLS(
+            rank=self.rank,
+            lam_F=self.lam_F,
+            lam_B=self.lam_B,
+            max_sweeps=self.max_sweeps,
+            rel_tol=self.rel_tol,
+            d_floor=self.d_floor,
+            cg_maxit=self.cg_maxit,
+            cg_tol=self.cg_tol,
+            scale_correct=self.scale_correct,
+            scale_floor=self.scale_floor,
+        )
+        Xs, Y = self.as_arrays()
+        estimator.fit(Xs, Y)
+        self.estimator_ = estimator
+        result = ALSGLSSystemResults(self, estimator)
+        self.result_ = result
+        return result
+
+
+class ALSGLSSystemResults:
+    """Lightweight results container mimicking ``statsmodels`` outputs."""
+
+    def __init__(self, model: ALSGLSSystem, estimator: ALSGLS) -> None:
+        self.model = model
+        self.estimator = estimator
+        Xs, Y = model.as_arrays()
+
+        flattened = [b.ravel() for b in estimator.B_list_ if b.size]
+        self.params = np.concatenate(flattened) if flattened else np.empty(0)
+        self.param_labels = [
+            (eq.name, col)
+            for eq in model._equations
+            for col in eq.column_names
+        ]
+        self.B_list = estimator.B_list_
+        self.F = estimator.F_
+        self.D = estimator.D_
+        self.mem_mb_est = estimator.mem_mb_est_
+        self.info = estimator.info_
+        self.rank = estimator.rank_
+
+        self.fittedvalues = XB_from_Blist(Xs, self.B_list)
+        self.resids = Y - self.fittedvalues
+        self.nll_per_row = float(nll_per_row(self.resids, self.F, self.D))
+        self.loglike = -self.nll_per_row * self.model.nobs
+
+    def params_as_series(self):
+        try:
+            import pandas as pd  # type: ignore
+        except ImportError as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("pandas is required for params_as_series()") from exc
+
+        mi = pd.MultiIndex.from_tuples(self.param_labels, names=["equation", "variable"])
+        return pd.Series(self.params, index=mi)
+
+    def predict(self, exog: Optional[Mapping[Any, Any] | Sequence[Any]] = None) -> np.ndarray:
+        if exog is None:
+            Xs = [eq.X for eq in self.model._equations]
+        else:
+            if isinstance(exog, Mapping):
+                items = [exog[eq.name] for eq in self.model._equations]
+            else:
+                items = list(exog)
+            if len(items) != len(self.model._equations):
+                raise ValueError("Expected design matrices for all equations")
+            Xs = []
+            for item, eq in zip(items, self.model._equations):
+                arr = _asarray_2d(item)
+                if arr.shape[1] != eq.X.shape[1]:
+                    raise ValueError("Design matrix has incompatible number of columns")
+                Xs.append(arr)
+        return XB_from_Blist(Xs, self.B_list)
+
+    def summary_dict(self) -> Dict[str, Any]:
+        return {
+            "rank": self.rank,
+            "nobs": self.model.nobs,
+            "keqs": self.model.keqs,
+            "mem_mb_est": self.mem_mb_est,
+            "nll_per_row": self.nll_per_row,
+            "loglike": self.loglike,
+        }
diff --git a/benchmarks/compare_sur.py b/benchmarks/compare_sur.py
new file mode 100644
index 0000000..bf7f1db
--- /dev/null
+++ b/benchmarks/compare_sur.py
@@ -0,0 +1,260 @@
+"""Benchmark ALS-GLS against statsmodels and linearmodels SUR implementations."""
+
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import math
+import tempfile
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple
+
+import numpy as np
+
+from alsgls import ALSGLS, nll_per_row
+from alsgls.ops import XB_from_Blist
+
+
+def _simulate_sur(N_tr: int, N_te: int, K: int, p: int, k: int, seed: int):
+    rng = np.random.default_rng(seed)
+    N = N_tr + N_te
+    Xs = [rng.standard_normal((N, p)) for _ in range(K)]
+    B = [rng.standard_normal((p, 1)) for _ in range(K)]
+    F = rng.standard_normal((K, k)) / np.sqrt(max(K, 1))
+    D = 0.2 + 0.3 * rng.random(K)
+    Z = rng.standard_normal((N, k))
+    Y = XB_from_Blist(Xs, B) + Z @ F.T + rng.standard_normal((N, K)) * np.sqrt(D)[None, :]
+    return (
+        [X[:N_tr] for X in Xs],
+        Y[:N_tr],
+        [X[N_tr:] for X in Xs],
+        Y[N_tr:],
+        B,
+        F,
+        D,
+    )
+
+
+def _stack_beta(B_list: Iterable[np.ndarray]) -> np.ndarray:
+    blocks = [np.asarray(b).ravel() for b in B_list]
+    return np.concatenate(blocks) if blocks else np.empty(0)
+
+
+def _gaussian_nll(residuals: np.ndarray, sigma: np.ndarray) -> float:
+    sign, logdet = np.linalg.slogdet(sigma)
+    if sign <= 0:
+        raise ValueError("Covariance matrix is not SPD")
+    inv = np.linalg.inv(sigma)
+    quad = np.sum(residuals @ inv * residuals) / residuals.shape[0]
+    return 0.5 * (quad + logdet + residuals.shape[1] * math.log(2.0 * math.pi))
+
+
+def _maybe_memray_runner(func, *args, **kwargs):
+    backend = kwargs.pop("backend", "memray")
+    if backend == "none":
+        return func(*args, **kwargs), None
+
+    if backend == "memray":
+        try:
+            import memray
+        except ImportError:
+            raise RuntimeError("memray is not installed; install memray or use --memory-backend none")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = Path(tmpdir) / "run.bin"
+            with memray.Tracker(path):
+                result = func(*args, **kwargs)
+            reader = memray.FileReader(path)
+            peak = reader.metadata.peak_memory
+            return result, peak
+
+    if backend == "fil":
+        raise RuntimeError(
+            "Fil profiler integration requires running this script via `fil-profile`; "
+            "re-run with --memory-backend none when invoking through fil."
+        )
+
+    if backend == "resource":
+        import resource
+
+        before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        result = func(*args, **kwargs)
+        after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        peak = max(before, after)
+        return result, peak
+
+    raise ValueError(f"Unknown memory backend {backend}")
+
+
+def _run_statsmodels(system, **fit_kwargs):
+    try:
+        from statsmodels.sur.sur_model import SUR as SMSUR
+    except ImportError:
+        return None, "missing"
+
+    model = SMSUR(system)
+    results = model.fit(**fit_kwargs)
+    sigma = np.asarray(results.sigma)
+    params = []
+    for name in system.keys():
+        params.append(np.asarray(results.params.loc[name]).reshape(-1, 1))
+    fitted = np.column_stack([results.predict(eq=name) for name in system.keys()])
+    return {"B": params, "sigma": sigma, "fitted": fitted, "resid": np.asarray(results.resid)}, "ok"
+
+
+def _run_linearmodels(system, **fit_kwargs):
+    try:
+        from linearmodels.system import SUR as LMSUR
+    except ImportError:
+        return None, "missing"
+
+    model = LMSUR(system)
+    results = model.fit(**fit_kwargs)
+    sigma = np.asarray(results.sigma)
+    params = [np.asarray(results.params[name]).reshape(-1, 1) for name in system.keys()]
+    fitted = np.column_stack([results.predict(eq=name) for name in system.keys()])
+    resid = np.column_stack([results.resids[name] for name in system.keys()])
+    return {"B": params, "sigma": sigma, "fitted": fitted, "resid": resid}, "ok"
+
+
+@dataclass
+class BenchmarkResult:
+    K: int
+    N: int
+    p: int
+    k: int
+    method: str
+    beta_rmse: Optional[float]
+    test_nll: Optional[float]
+    peak_memory: Optional[float]
+    wall_time: float
+    status: str
+
+
+def run_benchmark(
+    grid: Iterable[Tuple[int, int, int, int]],
+    *,
+    seed: int = 0,
+    memory_backend: str = "resource",
+) -> List[BenchmarkResult]:
+    results: List[BenchmarkResult] = []
+
+    for K, N, p, k in grid:
+        X_tr, Y_tr, X_te, Y_te, B_true, _, _ = _simulate_sur(
+            N_tr=N,
+            N_te=N // 2,
+            K=K,
+            p=p,
+            k=k,
+            seed=seed,
+        )
+
+        system = {f"eq{j}": (Y_tr[:, j], X_tr[j]) for j in range(K)}
+
+        def _als_run():
+            model = ALSGLS(rank=k, max_sweeps=12)
+            model.fit(X_tr, Y_tr)
+            preds = model.predict(X_te)
+            return model, preds
+
+        t0 = time.perf_counter()
+        (als_model, als_preds), als_peak = _maybe_memray_runner(_als_run, backend=memory_backend)
+        wall = time.perf_counter() - t0
+        beta_rmse = float(np.sqrt(np.mean((_stack_beta(als_model.B_list_) - _stack_beta(B_true)) ** 2)))
+        nll = float(nll_per_row(Y_te - als_preds, als_model.F_, als_model.D_))
+        results.append(
+            BenchmarkResult(
+                K=K,
+                N=N,
+                p=p,
+                k=k,
+                method="alsgls",
+                beta_rmse=beta_rmse,
+                test_nll=nll,
+                peak_memory=als_peak,
+                wall_time=wall,
+                status="ok",
+            )
+        )
+
+        for name, runner in ("statsmodels", _run_statsmodels), ("linearmodels", _run_linearmodels):
+            t0 = time.perf_counter()
+            payload, status = runner(system)
+            wall = time.perf_counter() - t0
+            if status != "ok":
+                results.append(
+                    BenchmarkResult(
+                        K=K,
+                        N=N,
+                        p=p,
+                        k=k,
+                        method=name,
+                        beta_rmse=None,
+                        test_nll=None,
+                        peak_memory=None,
+                        wall_time=wall,
+                        status=status,
+                    )
+                )
+                continue
+
+            beta = _stack_beta(payload["B"])
+            beta_rmse = float(np.sqrt(np.mean((beta - _stack_beta(B_true)) ** 2)))
+            resid_te = Y_te - XB_from_Blist(X_te, payload["B"])
+            nll = float(_gaussian_nll(resid_te, payload["sigma"]))
+            results.append(
+                BenchmarkResult(
+                    K=K,
+                    N=N,
+                    p=p,
+                    k=k,
+                    method=name,
+                    beta_rmse=beta_rmse,
+                    test_nll=nll,
+                    peak_memory=None,
+                    wall_time=wall,
+                    status="ok",
+                )
+            )
+
+    return results
+
+
+def parse_grid(K_vals, N_vals, p_vals, k_vals):
+    return list(itertools.product(K_vals, N_vals, p_vals, k_vals))
+
+
+def main(argv: Optional[Iterable[str]] = None) -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--K", nargs="*", type=int, default=[20, 40])
+    parser.add_argument("--N", nargs="*", type=int, default=[200])
+    parser.add_argument("--p", nargs="*", type=int, default=[3])
+    parser.add_argument("--k", nargs="*", type=int, default=[2, 4])
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--memory-backend",
+        choices=["memray", "fil", "resource", "none"],
+        default="resource",
+    )
+    parser.add_argument("--json", type=Path, help="Optional path to dump JSON results")
+    args = parser.parse_args(argv)
+
+    grid = parse_grid(args.K, args.N, args.p, args.k)
+    results = run_benchmark(grid, seed=args.seed, memory_backend=args.memory_backend)
+
+    for row in results:
+        print(
+            f"K={row.K:3d} N={row.N:4d} p={row.p:2d} k={row.k:2d} | {row.method:12s} "
+            f"beta_RMSE={row.beta_rmse!r} test_NLL={row.test_nll!r} peak_mem={row.peak_memory!r} "
+            f"time={row.wall_time:.2f}s status={row.status}"
+        )
+
+    if args.json:
+        args.json.write_text(json.dumps([row.__dict__ for row in results], indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/source/api/alsgls.rst b/docs/source/api/alsgls.rst
index 60da38e..5ed9d11 100644
--- a/docs/source/api/alsgls.rst
+++ b/docs/source/api/alsgls.rst
@@ -33,6 +33,14 @@ alsgls.ops module
    :undoc-members:
    :show-inheritance:
 
+alsgls.api module
+-----------------
+
+.. automodule:: alsgls.api
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 alsgls.sim module
 -----------------
 
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index a140221..4288718 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -21,8 +21,9 @@ For development, clone the repository and install in editable mode:
 Basic Usage
 -----------
 
-The core functionality of alsgls revolves around the :func:`alsgls.als_gls` function, which implements 
-the Alternating Least Squares algorithm for low-rank+diagonal GLS estimation.
+The high-level :class:`alsgls.ALSGLS` estimator exposes a familiar scikit-learn
+API. Under the hood it calls :func:`alsgls.als_gls`, the alternating
+least-squares solver for low-rank-plus-diagonal GLS.
 
 Simulating Data
 ~~~~~~~~~~~~~~~
@@ -45,99 +46,100 @@ First, let's generate some synthetic data using the built-in simulation function
 Fitting the Model
 ~~~~~~~~~~~~~~~~~
 
-Now we can fit the ALS model to estimate the regression coefficients and factor structure:
+Fit the ALS model and inspect the diagnostic trace:
 
 .. code-block:: python
 
-   from alsgls import als_gls
+   from alsgls import ALSGLS
 
-   # Fit ALS model
-   B, F, D, memory_usage, convergence_info = als_gls(
-       Xs_train, 
-       Y_train, 
-       k=4,         # Factor rank
-       max_iter=10  # Maximum ALS iterations
-   )
+   estimator = ALSGLS(rank="auto", max_sweeps=12)
+   estimator.fit(Xs_train, Y_train)
+   print(f"Chosen rank: {estimator.rank_}")
+   print(f"NLL trace: {estimator.info_['nll_trace']}")
 
-The function returns:
+The fitted estimator exposes
 
-- ``B``: List of regression coefficient vectors for each equation
-- ``F``: Factor loadings matrix (K × k)
-- ``D``: Diagonal noise variances (K,)
-- ``memory_usage``: Peak memory usage during computation
-- ``convergence_info``: Dictionary with convergence statistics
+- ``B_list_``: list of regression coefficient vectors for each equation,
+- ``F_``: factor loadings matrix ``(K × rank_)``,
+- ``D_``: diagonal noise variances,
+- ``info_``: convergence diagnostics including the NLL trace and CG stats.
 
 Making Predictions
 ~~~~~~~~~~~~~~~~~~~
 
-Use the fitted coefficients to make predictions on new data:
+Use :meth:`alsgls.ALSGLS.predict` and :meth:`alsgls.ALSGLS.score` to evaluate on
+held-out data. ``score`` returns the negative Gaussian log-likelihood per
+observation (larger is better).
 
 .. code-block:: python
 
-   from alsgls import XB_from_Blist, nll_per_row
-
-   # Generate predictions
-   Y_pred = XB_from_Blist(Xs_test, B)
+   Y_pred = estimator.predict(Xs_test)
+   test_score = estimator.score(Xs_test, Y_test)
+   print(f"Test NLL per observation: {-test_score:.4f}")
 
-   # Evaluate using negative log-likelihood per row
-   residuals = Y_test - Y_pred
-   nll = nll_per_row(residuals, F, D)
-   print(f"Negative log-likelihood per row: {nll:.4f}")
+Statsmodels-style System API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Comparing with EM
-~~~~~~~~~~~~~~~~~
-
-You can also fit the same model using the EM algorithm for comparison:
+To mirror ``statsmodels`` and ``linearmodels`` SUR interfaces, use
+:class:`alsgls.ALSGLSSystem` with a dictionary mapping equation names to
+``(y, X)`` pairs:
 
 .. code-block:: python
 
-   from alsgls import em_gls
+   from alsgls import ALSGLSSystem
 
-   # Fit EM model (higher memory usage)
-   B_em, F_em, D_em, memory_em, _ = em_gls(
-       Xs_train, 
-       Y_train, 
-       k=4,
-       max_iter=50
-   )
+   system = {f"eq{j}": (Y_train[:, j], Xs_train[j]) for j in range(Y_train.shape[1])}
+   sys_model = ALSGLSSystem(system, rank="auto")
+   sys_results = sys_model.fit()
+   print(sys_results.summary_dict())
 
-   print(f"ALS memory usage: {memory_usage:.3f} MB")
-   print(f"EM memory usage: {memory_em:.3f} MB")
-   print(f"Memory ratio (EM/ALS): {memory_em/memory_usage:.1f}x")
+The returned :class:`alsgls.ALSGLSSystemResults` object stores the fitted
+coefficients, residuals, and NLL trace, and provides :meth:`predict` and
+``params_as_series`` (optional pandas dependency) for easy comparisons with
+classical SUR packages.
 
 Complete Example
 ~~~~~~~~~~~~~~~~
 
-Here's a complete working example:
+Here's a complete working example comparing ALS with the EM baseline:
 
 .. code-block:: python
 
-   from alsgls import (
-       simulate_sur, als_gls, em_gls, 
-       XB_from_Blist, nll_per_row, mse
-   )
+   from alsgls import simulate_sur, ALSGLS, em_gls, XB_from_Blist, mse
 
-   # Simulate data
    Xs_tr, Y_tr, Xs_te, Y_te = simulate_sur(N_tr=240, N_te=120, K=60, p=3, k=4)
 
-   # Fit both models
-   B_als, F_als, D_als, mem_als, _ = als_gls(Xs_tr, Y_tr, k=4)
-   B_em, F_em, D_em, mem_em, _ = em_gls(Xs_tr, Y_tr, k=4)
+   als = ALSGLS(rank=4, max_sweeps=10)
+   als.fit(Xs_tr, Y_tr)
+   Y_pred_als = als.predict(Xs_te)
 
-   # Compare predictions
-   Y_pred_als = XB_from_Blist(Xs_te, B_als)
+   B_em, F_em, D_em, mem_em, _ = em_gls(Xs_tr, Y_tr, k=4)
    Y_pred_em = XB_from_Blist(Xs_te, B_em)
 
    mse_als = mse(Y_te, Y_pred_als)
    mse_em = mse(Y_te, Y_pred_em)
 
-   print(f"ALS MSE: {mse_als:.6f}, Memory: {mem_als:.3f} MB")
-   print(f"EM MSE: {mse_em:.6f}, Memory: {mem_em:.3f} MB")
-   print(f"Memory savings: {mem_em/mem_als:.1f}x")
+   print(f"ALS MSE: {mse_als:.6f}")
+   print(f"EM MSE:  {mse_em:.6f}")
+   print(f"ALS sweeps used: {len(als.info_['nll_trace']) - 1}")
+
+Defaults and Troubleshooting
+----------------------------
+
+- **Rank heuristic** – The estimator uses ``min(8, ceil(K / 10))`` when
+  ``rank="auto"``; raise the rank if residual correlations persist, or lower it
+  to avoid overfitting tiny samples.
+- **Ridge parameters** – ``lam_F`` and ``lam_B`` default to ``1e-3``. Increase
+  them if the CG solver reports many iterations or the NLL trace stagnates.
+- **Diagonal floor** – ``d_floor`` keeps the diagonal noise positive. Tighten it
+  (e.g. ``1e-6``) in ill-conditioned problems to prevent breakdowns.
+- **Stopping criteria** – ALS stops when the relative improvement in the NLL is
+  below ``rel_tol`` (default ``1e-6``) or when ``max_sweeps`` is reached. Inspect
+  ``info_["nll_trace"]`` and ``info_["accept_t"]`` to diagnose plateaus.
 
 Next Steps
 ----------
 
 - Read about the :doc:`mathematical_background` behind the algorithms
 - Learn about the differences in :doc:`als_vs_em` approaches  
-- Explore more detailed :doc:`examples` and use cases
\ No newline at end of file
+- Explore more detailed :doc:`examples` and use cases
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000..e33780f
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,62 @@
+import numpy as np
+
+from alsgls import ALSGLS, ALSGLSSystem, als_gls, nll_per_row
+from alsgls.ops import XB_from_Blist
+
+
+def _random_sur(rng, N=60, K=4, p=3):
+    Xs = [rng.standard_normal((N, p)) for _ in range(K)]
+    B = [rng.standard_normal((p, 1)) for _ in range(K)]
+    F = rng.standard_normal((K, 2)) / np.sqrt(K)
+    D = 0.4 + 0.2 * rng.random(K)
+    Z = rng.standard_normal((N, 2))
+    Y = XB_from_Blist(Xs, B) + Z @ F.T + rng.standard_normal((N, K)) * np.sqrt(D)[None, :]
+    return Xs, Y
+
+
+def test_sklearn_api_matches_function():
+    rng = np.random.default_rng(123)
+    Xs, Y = _random_sur(rng, N=120, K=5, p=4)
+
+    direct = als_gls(Xs, Y, k=3, sweeps=10, rel_tol=1e-8)
+    model = ALSGLS(rank=3, max_sweeps=10, rel_tol=1e-8)
+    fitted = model.fit(Xs, Y)
+
+    assert fitted is model
+    for b_direct, b_model in zip(direct[0], model.B_list_):
+        assert np.allclose(b_direct, b_model, atol=1e-8)
+    assert np.allclose(direct[1], model.F_, atol=1e-8)
+    assert np.allclose(direct[2], model.D_, atol=1e-8)
+
+    preds = model.predict(Xs)
+    assert np.allclose(preds, XB_from_Blist(Xs, model.B_list_), atol=1e-10)
+
+    score = model.score(Xs, Y)
+    nll = -score
+    assert np.isclose(nll, nll_per_row(Y - preds, model.F_, model.D_), atol=1e-10)
+
+
+def test_system_api_mirrors_estimator():
+    rng = np.random.default_rng(321)
+    Xs, Y = _random_sur(rng, N=80, K=3, p=2)
+
+    system = {f"eq{j}": (Y[:, j], Xs[j]) for j in range(3)}
+    sys_model = ALSGLSSystem(system, rank=2, max_sweeps=9, rel_tol=1e-8)
+    results = sys_model.fit()
+
+    assert results.model.keqs == 3
+    assert results.model.nobs == 80
+
+    preds = results.predict()
+    assert np.allclose(preds, XB_from_Blist(Xs, results.B_list), atol=1e-10)
+    assert np.allclose(results.fittedvalues, preds)
+
+    # Ensure the summary exposes key scalars
+    summary = results.summary_dict()
+    assert summary["keqs"] == 3
+    assert summary["nobs"] == 80
+    assert summary["rank"] == 2
+
+    # Recompute score and ensure consistency with estimator
+    estimator_score = sys_model.estimator_.score(Xs, Y)
+    assert np.isclose(estimator_score, -results.nll_per_row, atol=1e-10)
diff --git a/tests/test_identities.py b/tests/test_identities.py
index 0356fd5..ba117de 100644
--- a/tests/test_identities.py
+++ b/tests/test_identities.py
@@ -62,3 +62,20 @@ def test_nll_logdet_via_cholesky_consistency():
     val_dense = 0.5 * (quad_dense / N + logdet_dense + K * np.log(2.0 * np.pi))
 
     assert np.allclose(val, val_dense, rtol=RTOL, atol=ATOL)
+
+
+def test_determinant_lemma_matches_dense():
+    rng = np.random.default_rng(19)
+    K, k = 7, 3
+    F = rng.standard_normal((K, k))
+    D = rand_spd_diag(K, rng)
+
+    S = F @ F.T + np.diag(D)
+    logdet_dense = float(np.linalg.slogdet(S)[1])
+
+    Dinv, C_chol = woodbury_chol(F, D)
+    logdet_diag = np.sum(np.log(np.clip(D, 1e-30, None)))
+    logdet_small = 2.0 * np.sum(np.log(np.diag(C_chol)))
+    logdet_lemma = logdet_diag + logdet_small
+
+    assert np.allclose(logdet_dense, logdet_lemma, rtol=RTOL, atol=ATOL)
diff --git a/tests/test_rotation_invariance.py b/tests/test_rotation_invariance.py
new file mode 100644
index 0000000..d96e8ef
--- /dev/null
+++ b/tests/test_rotation_invariance.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+from alsgls import als_gls
+from alsgls.metrics import nll_per_row
+from alsgls.ops import XB_from_Blist
+
+
+def test_nll_invariant_under_factor_rotation():
+    rng = np.random.default_rng(5)
+    N, K, p, k = 150, 6, 4, 3
+    Xs = [rng.standard_normal((N, p)) for _ in range(K)]
+    B_true = [rng.standard_normal((p, 1)) for _ in range(K)]
+    F_true = rng.standard_normal((K, k)) / np.sqrt(K)
+    D_true = 0.2 + 0.3 * rng.random(K)
+    Z = rng.standard_normal((N, k))
+    Y = XB_from_Blist(Xs, B_true) + Z @ F_true.T + rng.standard_normal((N, K)) * np.sqrt(D_true)
+
+    B_hat, F_hat, D_hat, _, _ = als_gls(Xs, Y, k=k, sweeps=10, rel_tol=1e-8)
+    residuals = Y - XB_from_Blist(Xs, B_hat)
+    base = float(nll_per_row(residuals, F_hat, D_hat))
+
+    Q, _ = np.linalg.qr(rng.standard_normal((k, k)))
+    F_rot = F_hat @ Q
+    cov_diag = np.diag(F_hat @ F_hat.T + np.diag(D_hat))
+    D_rot = np.maximum(cov_diag - np.sum(F_rot * F_rot, axis=1), 1e-12)
+
+    rotated = float(nll_per_row(residuals, F_rot, D_rot))
+    assert np.isclose(base, rotated, rtol=5e-8, atol=5e-9)