diff --git a/README.md b/README.md index 3fa9761..976f8c6 100644 --- a/README.md +++ b/README.md @@ -25,15 +25,26 @@ pip install -e . ### Usage ```python -from alsgls import als_gls, simulate_sur, nll_per_row, XB_from_Blist +from alsgls import ALSGLS, ALSGLSSystem, simulate_sur Xs_tr, Y_tr, Xs_te, Y_te = simulate_sur(N_tr=240, N_te=120, K=60, p=3, k=4) -B, F, D, mem, _ = als_gls(Xs_tr, Y_tr, k=4) -Yhat_te = XB_from_Blist(Xs_te, B) -nll = nll_per_row(Y_te - Yhat_te, F, D) + +# Scikit-learn style estimator +est = ALSGLS(rank="auto", max_sweeps=12) +est.fit(Xs_tr, Y_tr) +test_score = est.score(Xs_te, Y_te) # negative test NLL per observation + +# Statsmodels-style system interface +system = {f"eq{j}": (Y_tr[:, j], Xs_tr[j]) for j in range(Y_tr.shape[1])} +sys_model = ALSGLSSystem(system, rank="auto") +sys_results = sys_model.fit() +params = sys_results.params_as_series() # pandas optional ``` -See `examples/compare_als_vs_em.py` for a complete ALS versus EM comparison. +See `examples/compare_als_vs_em.py` for a complete ALS versus EM comparison. The +`benchmarks/compare_sur.py` script contrasts ALS-GLS with `statsmodels` and +`linearmodels` SUR implementations on matched simulation grids while recording +peak memory (via Memray, Fil, or the POSIX RSS high-water mark). ### Documentation and notebooks @@ -63,3 +74,24 @@ To show the magnitude, we ran a Monte‑Carlo experiment with N = 300 observat Statistically, the two estimators are indistinguishable (paired‑test p ≥ 0.14). Computationally, ALS needs only a few megabytes whereas EM needs tens to hundreds. +### Defaults, tuning knobs, and failure modes + +- **Rank (`k`)** – By default the high-level APIs pick `min(8, ceil(K / 10))`, a + conservative fraction of the number of equations. Increase `rank` if the + cross-equation correlation matrix is slow to decay; decrease it when the + diagonal dominates. +- **ALS ridge terms (`lam_F`, `lam_B`)** – Defaults to `1e-3` for both the + latent-factor and regression updates; raise them slightly (e.g. `1e-2`) if CG + struggles to converge or the NLL trace plateaus early. +- **Noise floor (`d_floor`)** – Keeps the diagonal component positive; the + default `1e-8` protects against breakdowns when an equation is nearly + deterministic. Increase it in highly ill-conditioned settings. +- **Stopping criteria** – ALS stops when the relative drop in NLL per sweep is + below `1e-6` (configurable via `rel_tol`) or after `max_sweeps`. Inspect + `info["nll_trace"]` to diagnose stagnation. +- **Possible failures** – Large condition numbers or nearly-collinear regressors + can make the β-step CG solve slow; adjust `cg_tol`/`cg_maxit`, add stronger + ridge, or re-scale predictors. If `info["accept_t"]` stays at zero and the + NLL does not improve, the factor rank may be too large relative to the sample + size. + diff --git a/alsgls/__init__.py b/alsgls/__init__.py index a1f15da..6606a64 100644 --- a/alsgls/__init__.py +++ b/alsgls/__init__.py @@ -1,7 +1,19 @@ from .als import als_gls +from .api import ALSGLS, ALSGLSSystem, ALSGLSSystemResults from .em import em_gls from .metrics import mse, nll_per_row -from .sim import simulate_sur, simulate_gls from .ops import XB_from_Blist +from .sim import simulate_gls, simulate_sur -__all__ = ["als_gls", "em_gls", "mse", "nll_per_row", "simulate_sur", "simulate_gls", "XB_from_Blist"] +__all__ = [ + "ALSGLS", + "ALSGLSSystem", + "ALSGLSSystemResults", + "XB_from_Blist", + "als_gls", + "em_gls", + "mse", + "nll_per_row", + "simulate_gls", + "simulate_sur", +] diff --git a/alsgls/als.py b/alsgls/als.py index 9e75eee..c0b5d07 100644 --- a/alsgls/als.py +++ b/alsgls/als.py @@ -24,6 +24,7 @@ def als_gls( *, scale_correct: bool = True, scale_floor: float = 1e-8, + rel_tol: float = 1e-6, ): """ Alternating-least-squares GLS with low-rank-plus-diagonal covariance. @@ -51,6 +52,7 @@ def als_gls( cg_tol : float CG relative tolerance scale_correct : bool if True, try guarded MLE scale fix on Σ each sweep scale_floor : float min scalar for scale correction + rel_tol : float relative NLL improvement threshold for early stopping Returns ------- @@ -81,6 +83,8 @@ def als_gls( raise ValueError(f"k must be between 1 and min(K={K}, N={N})") if lam_F < 0 or lam_B < 0: raise ValueError("Regularization parameters must be non-negative") + if rel_tol < 0: + raise ValueError("rel_tol must be non-negative") p_list = [X.shape[1] for X in Xs] @@ -242,7 +246,7 @@ def try_with_scale(F_try, D_try): # Convergence: stop if relative improvement w.r.t previous post-Σ NLL is tiny rel_impr = (nll_prev - nll_curr) / max(1.0, abs(nll_prev)) nll_prev = nll_curr - if rel_impr < 1e-6: + if rel_impr < rel_tol: break # Memory estimate: F (K×k) + D (K) + U (N×k) doubles diff --git a/alsgls/api.py b/alsgls/api.py new file mode 100644 index 0000000..6ee4bda --- /dev/null +++ b/alsgls/api.py @@ -0,0 +1,345 @@ +"""High-level estimator APIs for ALS-based GLS fitting.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple + +import numpy as np + +from .als import als_gls +from .metrics import nll_per_row +from .ops import XB_from_Blist + + +def _auto_rank(num_equations: int) -> int: + """Heuristic rank used when the user does not provide one.""" + + if num_equations <= 0: + raise ValueError("num_equations must be positive") + # Cap the rank to avoid chasing noise; allow moderate growth with K. + return max(1, min(8, int(np.ceil(num_equations / 10)))) + + +def _asarray_2d(x: Any, *, dtype: np.dtype = np.float64) -> np.ndarray: + """Convert array-like input to a 2D ``numpy.ndarray``.""" + + if hasattr(x, "to_numpy"): + arr = x.to_numpy() + else: + arr = np.asarray(x) + arr = np.asarray(arr, dtype=dtype) + if arr.ndim == 1: + arr = arr[:, None] + if arr.ndim != 2: + raise ValueError("Input must be convertible to a 2D array") + return arr + + +def _column_names(obj: Any, size: int) -> List[str]: + if hasattr(obj, "columns"): + return list(obj.columns) + if hasattr(obj, "dtype") and getattr(obj.dtype, "names", None): + return list(obj.dtype.names) + return [f"x{i}" for i in range(size)] + + +def _eq_name(name: Any, index: int) -> str: + return str(name) if name is not None else f"eq{index}" + + +class ALSGLS: + """Scikit-learn style estimator for low-rank GLS via ALS.""" + + def __init__( + self, + *, + rank: Optional[int | str] = "auto", + lam_F: float = 1e-3, + lam_B: float = 1e-3, + max_sweeps: int = 12, + rel_tol: float = 1e-6, + d_floor: float = 1e-8, + cg_maxit: int = 800, + cg_tol: float = 3e-7, + scale_correct: bool = True, + scale_floor: float = 1e-8, + ) -> None: + self.rank = rank + self.lam_F = lam_F + self.lam_B = lam_B + self.max_sweeps = max_sweeps + self.rel_tol = rel_tol + self.d_floor = d_floor + self.cg_maxit = cg_maxit + self.cg_tol = cg_tol + self.scale_correct = scale_correct + self.scale_floor = scale_floor + + # ------------------------------------------------------------------ + # Scikit-learn estimator protocol + # ------------------------------------------------------------------ + def get_params(self, deep: bool = True) -> Dict[str, Any]: # noqa: D401 - sklearn API + return { + "rank": self.rank, + "lam_F": self.lam_F, + "lam_B": self.lam_B, + "max_sweeps": self.max_sweeps, + "rel_tol": self.rel_tol, + "d_floor": self.d_floor, + "cg_maxit": self.cg_maxit, + "cg_tol": self.cg_tol, + "scale_correct": self.scale_correct, + "scale_floor": self.scale_floor, + } + + def set_params(self, **params: Any) -> "ALSGLS": # noqa: D401 - sklearn API + for key, value in params.items(): + if not hasattr(self, key): + raise ValueError(f"Unknown parameter {key!r}") + setattr(self, key, value) + return self + + # ------------------------------------------------------------------ + # Fitting / inference + # ------------------------------------------------------------------ + def fit(self, Xs: Sequence[Any], Y: Any) -> "ALSGLS": + X_list = [_asarray_2d(X) for X in Xs] + Y_arr = _asarray_2d(Y) + + N, K = Y_arr.shape + if len(X_list) != K: + raise ValueError(f"Received {len(X_list)} design matrices for {K} equations") + + for j, X in enumerate(X_list): + if X.shape[0] != N: + raise ValueError(f"X[{j}] has {X.shape[0]} rows but Y has {N}") + + if self.rank == "auto" or self.rank is None: + k = _auto_rank(K) + else: + k = int(self.rank) + + if not (1 <= k <= min(K, N)): + raise ValueError(f"rank must be in [1, min(K={K}, N={N})]") + + B_list, F, D, mem_mb, info = als_gls( + X_list, + Y_arr, + k=k, + lam_F=self.lam_F, + lam_B=self.lam_B, + sweeps=self.max_sweeps, + d_floor=self.d_floor, + cg_maxit=self.cg_maxit, + cg_tol=self.cg_tol, + scale_correct=self.scale_correct, + scale_floor=self.scale_floor, + rel_tol=self.rel_tol, + ) + + self.B_list_ = B_list + self.F_ = F + self.D_ = D + self.mem_mb_est_ = mem_mb + self.info_ = info + self.n_features_in_ = tuple(X.shape[1] for X in X_list) + self.n_targets_ = K + self.n_obs_ = N + self.rank_ = k + self.training_residuals_ = Y_arr - XB_from_Blist(X_list, B_list) + self.is_fitted_ = True + return self + + def _ensure_fitted(self) -> None: + if not getattr(self, "is_fitted_", False): + raise RuntimeError("The estimator has not been fitted yet") + + def predict(self, Xs: Sequence[Any]) -> np.ndarray: + self._ensure_fitted() + X_list = [_asarray_2d(X) for X in Xs] + if len(X_list) != len(self.B_list_): + raise ValueError("Number of design matrices does not match fitted model") + for j, (X, B) in enumerate(zip(X_list, self.B_list_)): + if X.shape[1] != B.shape[0]: + raise ValueError(f"X[{j}] has {X.shape[1]} columns but expected {B.shape[0]}") + return XB_from_Blist(X_list, self.B_list_) + + def score(self, Xs: Sequence[Any], Y: Any) -> float: + self._ensure_fitted() + Y_arr = _asarray_2d(Y) + if Y_arr.shape[1] != self.n_targets_: + raise ValueError("Y has incompatible number of targets") + preds = self.predict(Xs) + if preds.shape != Y_arr.shape: + raise ValueError("Predictions and Y have incompatible shapes") + residual = Y_arr - preds + return -float(nll_per_row(residual, self.F_, self.D_)) + + +@dataclass +class _SystemEquation: + name: str + y: np.ndarray + X: np.ndarray + column_names: List[str] + + +class ALSGLSSystem: + """Statsmodels-style system container for ALS GLS fitting.""" + + def __init__( + self, + system: Mapping[Any, Tuple[Any, Any]] | Sequence[Tuple[Any, Tuple[Any, Any]]], + *, + rank: Optional[int | str] = "auto", + lam_F: float = 1e-3, + lam_B: float = 1e-3, + max_sweeps: int = 12, + rel_tol: float = 1e-6, + d_floor: float = 1e-8, + cg_maxit: int = 800, + cg_tol: float = 3e-7, + scale_correct: bool = True, + scale_floor: float = 1e-8, + ) -> None: + if isinstance(system, Mapping): + items = list(system.items()) + else: + items = list(system) + if len(items) == 0: + raise ValueError("system must contain at least one equation") + + equations: List[_SystemEquation] = [] + n_obs: Optional[int] = None + + for idx, (name, (y, X)) in enumerate(items): + y_arr = _asarray_2d(y) + X_arr = _asarray_2d(X) + if y_arr.shape[1] != 1: + raise ValueError("Each equation's response must be 1D") + y_arr = y_arr.reshape(-1, 1) + if n_obs is None: + n_obs = y_arr.shape[0] + elif y_arr.shape[0] != n_obs: + raise ValueError("All equations must share the same number of rows") + if X_arr.shape[0] != n_obs: + raise ValueError("Design matrix rows must match the response length") + equations.append( + _SystemEquation( + name=_eq_name(name, idx), + y=y_arr, + X=X_arr, + column_names=_column_names(X, X_arr.shape[1]), + ) + ) + + self._equations = equations + self.rank = rank + self.lam_F = lam_F + self.lam_B = lam_B + self.max_sweeps = max_sweeps + self.rel_tol = rel_tol + self.d_floor = d_floor + self.cg_maxit = cg_maxit + self.cg_tol = cg_tol + self.scale_correct = scale_correct + self.scale_floor = scale_floor + + @property + def nobs(self) -> int: + return self._equations[0].y.shape[0] + + @property + def keqs(self) -> int: + return len(self._equations) + + def as_arrays(self) -> Tuple[List[np.ndarray], np.ndarray]: + Xs = [eq.X for eq in self._equations] + Y = np.column_stack([eq.y for eq in self._equations]) + return Xs, Y + + def fit(self) -> "ALSGLSSystemResults": + estimator = ALSGLS( + rank=self.rank, + lam_F=self.lam_F, + lam_B=self.lam_B, + max_sweeps=self.max_sweeps, + rel_tol=self.rel_tol, + d_floor=self.d_floor, + cg_maxit=self.cg_maxit, + cg_tol=self.cg_tol, + scale_correct=self.scale_correct, + scale_floor=self.scale_floor, + ) + Xs, Y = self.as_arrays() + estimator.fit(Xs, Y) + self.estimator_ = estimator + result = ALSGLSSystemResults(self, estimator) + self.result_ = result + return result + + +class ALSGLSSystemResults: + """Lightweight results container mimicking ``statsmodels`` outputs.""" + + def __init__(self, model: ALSGLSSystem, estimator: ALSGLS) -> None: + self.model = model + self.estimator = estimator + Xs, Y = model.as_arrays() + + flattened = [b.ravel() for b in estimator.B_list_ if b.size] + self.params = np.concatenate(flattened) if flattened else np.empty(0) + self.param_labels = [ + (eq.name, col) + for eq in model._equations + for col in eq.column_names + ] + self.B_list = estimator.B_list_ + self.F = estimator.F_ + self.D = estimator.D_ + self.mem_mb_est = estimator.mem_mb_est_ + self.info = estimator.info_ + self.rank = estimator.rank_ + + self.fittedvalues = XB_from_Blist(Xs, self.B_list) + self.resids = Y - self.fittedvalues + self.nll_per_row = float(nll_per_row(self.resids, self.F, self.D)) + self.loglike = -self.nll_per_row * self.model.nobs + + def params_as_series(self): + try: + import pandas as pd # type: ignore + except ImportError as exc: # pragma: no cover - optional dependency + raise RuntimeError("pandas is required for params_as_series()") from exc + + mi = pd.MultiIndex.from_tuples(self.param_labels, names=["equation", "variable"]) + return pd.Series(self.params, index=mi) + + def predict(self, exog: Optional[Mapping[Any, Any] | Sequence[Any]] = None) -> np.ndarray: + if exog is None: + Xs = [eq.X for eq in self.model._equations] + else: + if isinstance(exog, Mapping): + items = [exog[eq.name] for eq in self.model._equations] + else: + items = list(exog) + if len(items) != len(self.model._equations): + raise ValueError("Expected design matrices for all equations") + Xs = [] + for item, eq in zip(items, self.model._equations): + arr = _asarray_2d(item) + if arr.shape[1] != eq.X.shape[1]: + raise ValueError("Design matrix has incompatible number of columns") + Xs.append(arr) + return XB_from_Blist(Xs, self.B_list) + + def summary_dict(self) -> Dict[str, Any]: + return { + "rank": self.rank, + "nobs": self.model.nobs, + "keqs": self.model.keqs, + "mem_mb_est": self.mem_mb_est, + "nll_per_row": self.nll_per_row, + "loglike": self.loglike, + } diff --git a/benchmarks/compare_sur.py b/benchmarks/compare_sur.py new file mode 100644 index 0000000..bf7f1db --- /dev/null +++ b/benchmarks/compare_sur.py @@ -0,0 +1,260 @@ +"""Benchmark ALS-GLS against statsmodels and linearmodels SUR implementations.""" + +from __future__ import annotations + +import argparse +import itertools +import json +import math +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, List, Optional, Tuple + +import numpy as np + +from alsgls import ALSGLS, nll_per_row +from alsgls.ops import XB_from_Blist + + +def _simulate_sur(N_tr: int, N_te: int, K: int, p: int, k: int, seed: int): + rng = np.random.default_rng(seed) + N = N_tr + N_te + Xs = [rng.standard_normal((N, p)) for _ in range(K)] + B = [rng.standard_normal((p, 1)) for _ in range(K)] + F = rng.standard_normal((K, k)) / np.sqrt(max(K, 1)) + D = 0.2 + 0.3 * rng.random(K) + Z = rng.standard_normal((N, k)) + Y = XB_from_Blist(Xs, B) + Z @ F.T + rng.standard_normal((N, K)) * np.sqrt(D)[None, :] + return ( + [X[:N_tr] for X in Xs], + Y[:N_tr], + [X[N_tr:] for X in Xs], + Y[N_tr:], + B, + F, + D, + ) + + +def _stack_beta(B_list: Iterable[np.ndarray]) -> np.ndarray: + blocks = [np.asarray(b).ravel() for b in B_list] + return np.concatenate(blocks) if blocks else np.empty(0) + + +def _gaussian_nll(residuals: np.ndarray, sigma: np.ndarray) -> float: + sign, logdet = np.linalg.slogdet(sigma) + if sign <= 0: + raise ValueError("Covariance matrix is not SPD") + inv = np.linalg.inv(sigma) + quad = np.sum(residuals @ inv * residuals) / residuals.shape[0] + return 0.5 * (quad + logdet + residuals.shape[1] * math.log(2.0 * math.pi)) + + +def _maybe_memray_runner(func, *args, **kwargs): + backend = kwargs.pop("backend", "memray") + if backend == "none": + return func(*args, **kwargs), None + + if backend == "memray": + try: + import memray + except ImportError: + raise RuntimeError("memray is not installed; install memray or use --memory-backend none") + + with tempfile.TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "run.bin" + with memray.Tracker(path): + result = func(*args, **kwargs) + reader = memray.FileReader(path) + peak = reader.metadata.peak_memory + return result, peak + + if backend == "fil": + raise RuntimeError( + "Fil profiler integration requires running this script via `fil-profile`; " + "re-run with --memory-backend none when invoking through fil." + ) + + if backend == "resource": + import resource + + before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + result = func(*args, **kwargs) + after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + peak = max(before, after) + return result, peak + + raise ValueError(f"Unknown memory backend {backend}") + + +def _run_statsmodels(system, **fit_kwargs): + try: + from statsmodels.sur.sur_model import SUR as SMSUR + except ImportError: + return None, "missing" + + model = SMSUR(system) + results = model.fit(**fit_kwargs) + sigma = np.asarray(results.sigma) + params = [] + for name in system.keys(): + params.append(np.asarray(results.params.loc[name]).reshape(-1, 1)) + fitted = np.column_stack([results.predict(eq=name) for name in system.keys()]) + return {"B": params, "sigma": sigma, "fitted": fitted, "resid": np.asarray(results.resid)}, "ok" + + +def _run_linearmodels(system, **fit_kwargs): + try: + from linearmodels.system import SUR as LMSUR + except ImportError: + return None, "missing" + + model = LMSUR(system) + results = model.fit(**fit_kwargs) + sigma = np.asarray(results.sigma) + params = [np.asarray(results.params[name]).reshape(-1, 1) for name in system.keys()] + fitted = np.column_stack([results.predict(eq=name) for name in system.keys()]) + resid = np.column_stack([results.resids[name] for name in system.keys()]) + return {"B": params, "sigma": sigma, "fitted": fitted, "resid": resid}, "ok" + + +@dataclass +class BenchmarkResult: + K: int + N: int + p: int + k: int + method: str + beta_rmse: Optional[float] + test_nll: Optional[float] + peak_memory: Optional[float] + wall_time: float + status: str + + +def run_benchmark( + grid: Iterable[Tuple[int, int, int, int]], + *, + seed: int = 0, + memory_backend: str = "resource", +) -> List[BenchmarkResult]: + results: List[BenchmarkResult] = [] + + for K, N, p, k in grid: + X_tr, Y_tr, X_te, Y_te, B_true, _, _ = _simulate_sur( + N_tr=N, + N_te=N // 2, + K=K, + p=p, + k=k, + seed=seed, + ) + + system = {f"eq{j}": (Y_tr[:, j], X_tr[j]) for j in range(K)} + + def _als_run(): + model = ALSGLS(rank=k, max_sweeps=12) + model.fit(X_tr, Y_tr) + preds = model.predict(X_te) + return model, preds + + t0 = time.perf_counter() + (als_model, als_preds), als_peak = _maybe_memray_runner(_als_run, backend=memory_backend) + wall = time.perf_counter() - t0 + beta_rmse = float(np.sqrt(np.mean((_stack_beta(als_model.B_list_) - _stack_beta(B_true)) ** 2))) + nll = float(nll_per_row(Y_te - als_preds, als_model.F_, als_model.D_)) + results.append( + BenchmarkResult( + K=K, + N=N, + p=p, + k=k, + method="alsgls", + beta_rmse=beta_rmse, + test_nll=nll, + peak_memory=als_peak, + wall_time=wall, + status="ok", + ) + ) + + for name, runner in ("statsmodels", _run_statsmodels), ("linearmodels", _run_linearmodels): + t0 = time.perf_counter() + payload, status = runner(system) + wall = time.perf_counter() - t0 + if status != "ok": + results.append( + BenchmarkResult( + K=K, + N=N, + p=p, + k=k, + method=name, + beta_rmse=None, + test_nll=None, + peak_memory=None, + wall_time=wall, + status=status, + ) + ) + continue + + beta = _stack_beta(payload["B"]) + beta_rmse = float(np.sqrt(np.mean((beta - _stack_beta(B_true)) ** 2))) + resid_te = Y_te - XB_from_Blist(X_te, payload["B"]) + nll = float(_gaussian_nll(resid_te, payload["sigma"])) + results.append( + BenchmarkResult( + K=K, + N=N, + p=p, + k=k, + method=name, + beta_rmse=beta_rmse, + test_nll=nll, + peak_memory=None, + wall_time=wall, + status="ok", + ) + ) + + return results + + +def parse_grid(K_vals, N_vals, p_vals, k_vals): + return list(itertools.product(K_vals, N_vals, p_vals, k_vals)) + + +def main(argv: Optional[Iterable[str]] = None) -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--K", nargs="*", type=int, default=[20, 40]) + parser.add_argument("--N", nargs="*", type=int, default=[200]) + parser.add_argument("--p", nargs="*", type=int, default=[3]) + parser.add_argument("--k", nargs="*", type=int, default=[2, 4]) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--memory-backend", + choices=["memray", "fil", "resource", "none"], + default="resource", + ) + parser.add_argument("--json", type=Path, help="Optional path to dump JSON results") + args = parser.parse_args(argv) + + grid = parse_grid(args.K, args.N, args.p, args.k) + results = run_benchmark(grid, seed=args.seed, memory_backend=args.memory_backend) + + for row in results: + print( + f"K={row.K:3d} N={row.N:4d} p={row.p:2d} k={row.k:2d} | {row.method:12s} " + f"beta_RMSE={row.beta_rmse!r} test_NLL={row.test_nll!r} peak_mem={row.peak_memory!r} " + f"time={row.wall_time:.2f}s status={row.status}" + ) + + if args.json: + args.json.write_text(json.dumps([row.__dict__ for row in results], indent=2)) + + +if __name__ == "__main__": + main() diff --git a/docs/source/api/alsgls.rst b/docs/source/api/alsgls.rst index 60da38e..5ed9d11 100644 --- a/docs/source/api/alsgls.rst +++ b/docs/source/api/alsgls.rst @@ -33,6 +33,14 @@ alsgls.ops module :undoc-members: :show-inheritance: +alsgls.api module +----------------- + +.. automodule:: alsgls.api + :members: + :undoc-members: + :show-inheritance: + alsgls.sim module ----------------- diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index a140221..4288718 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -21,8 +21,9 @@ For development, clone the repository and install in editable mode: Basic Usage ----------- -The core functionality of alsgls revolves around the :func:`alsgls.als_gls` function, which implements -the Alternating Least Squares algorithm for low-rank+diagonal GLS estimation. +The high-level :class:`alsgls.ALSGLS` estimator exposes a familiar scikit-learn +API. Under the hood it calls :func:`alsgls.als_gls`, the alternating +least-squares solver for low-rank-plus-diagonal GLS. Simulating Data ~~~~~~~~~~~~~~~ @@ -45,99 +46,100 @@ First, let's generate some synthetic data using the built-in simulation function Fitting the Model ~~~~~~~~~~~~~~~~~ -Now we can fit the ALS model to estimate the regression coefficients and factor structure: +Fit the ALS model and inspect the diagnostic trace: .. code-block:: python - from alsgls import als_gls + from alsgls import ALSGLS - # Fit ALS model - B, F, D, memory_usage, convergence_info = als_gls( - Xs_train, - Y_train, - k=4, # Factor rank - max_iter=10 # Maximum ALS iterations - ) + estimator = ALSGLS(rank="auto", max_sweeps=12) + estimator.fit(Xs_train, Y_train) + print(f"Chosen rank: {estimator.rank_}") + print(f"NLL trace: {estimator.info_['nll_trace']}") -The function returns: +The fitted estimator exposes -- ``B``: List of regression coefficient vectors for each equation -- ``F``: Factor loadings matrix (K × k) -- ``D``: Diagonal noise variances (K,) -- ``memory_usage``: Peak memory usage during computation -- ``convergence_info``: Dictionary with convergence statistics +- ``B_list_``: list of regression coefficient vectors for each equation, +- ``F_``: factor loadings matrix ``(K × rank_)``, +- ``D_``: diagonal noise variances, +- ``info_``: convergence diagnostics including the NLL trace and CG stats. Making Predictions ~~~~~~~~~~~~~~~~~~~ -Use the fitted coefficients to make predictions on new data: +Use :meth:`alsgls.ALSGLS.predict` and :meth:`alsgls.ALSGLS.score` to evaluate on +held-out data. ``score`` returns the negative Gaussian log-likelihood per +observation (larger is better). .. code-block:: python - from alsgls import XB_from_Blist, nll_per_row - - # Generate predictions - Y_pred = XB_from_Blist(Xs_test, B) + Y_pred = estimator.predict(Xs_test) + test_score = estimator.score(Xs_test, Y_test) + print(f"Test NLL per observation: {-test_score:.4f}") - # Evaluate using negative log-likelihood per row - residuals = Y_test - Y_pred - nll = nll_per_row(residuals, F, D) - print(f"Negative log-likelihood per row: {nll:.4f}") +Statsmodels-style System API +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Comparing with EM -~~~~~~~~~~~~~~~~~ - -You can also fit the same model using the EM algorithm for comparison: +To mirror ``statsmodels`` and ``linearmodels`` SUR interfaces, use +:class:`alsgls.ALSGLSSystem` with a dictionary mapping equation names to +``(y, X)`` pairs: .. code-block:: python - from alsgls import em_gls + from alsgls import ALSGLSSystem - # Fit EM model (higher memory usage) - B_em, F_em, D_em, memory_em, _ = em_gls( - Xs_train, - Y_train, - k=4, - max_iter=50 - ) + system = {f"eq{j}": (Y_train[:, j], Xs_train[j]) for j in range(Y_train.shape[1])} + sys_model = ALSGLSSystem(system, rank="auto") + sys_results = sys_model.fit() + print(sys_results.summary_dict()) - print(f"ALS memory usage: {memory_usage:.3f} MB") - print(f"EM memory usage: {memory_em:.3f} MB") - print(f"Memory ratio (EM/ALS): {memory_em/memory_usage:.1f}x") +The returned :class:`alsgls.ALSGLSSystemResults` object stores the fitted +coefficients, residuals, and NLL trace, and provides :meth:`predict` and +``params_as_series`` (optional pandas dependency) for easy comparisons with +classical SUR packages. Complete Example ~~~~~~~~~~~~~~~~ -Here's a complete working example: +Here's a complete working example comparing ALS with the EM baseline: .. code-block:: python - from alsgls import ( - simulate_sur, als_gls, em_gls, - XB_from_Blist, nll_per_row, mse - ) + from alsgls import simulate_sur, ALSGLS, em_gls, XB_from_Blist, mse - # Simulate data Xs_tr, Y_tr, Xs_te, Y_te = simulate_sur(N_tr=240, N_te=120, K=60, p=3, k=4) - # Fit both models - B_als, F_als, D_als, mem_als, _ = als_gls(Xs_tr, Y_tr, k=4) - B_em, F_em, D_em, mem_em, _ = em_gls(Xs_tr, Y_tr, k=4) + als = ALSGLS(rank=4, max_sweeps=10) + als.fit(Xs_tr, Y_tr) + Y_pred_als = als.predict(Xs_te) - # Compare predictions - Y_pred_als = XB_from_Blist(Xs_te, B_als) + B_em, F_em, D_em, mem_em, _ = em_gls(Xs_tr, Y_tr, k=4) Y_pred_em = XB_from_Blist(Xs_te, B_em) mse_als = mse(Y_te, Y_pred_als) mse_em = mse(Y_te, Y_pred_em) - print(f"ALS MSE: {mse_als:.6f}, Memory: {mem_als:.3f} MB") - print(f"EM MSE: {mse_em:.6f}, Memory: {mem_em:.3f} MB") - print(f"Memory savings: {mem_em/mem_als:.1f}x") + print(f"ALS MSE: {mse_als:.6f}") + print(f"EM MSE: {mse_em:.6f}") + print(f"ALS sweeps used: {len(als.info_['nll_trace']) - 1}") + +Defaults and Troubleshooting +---------------------------- + +- **Rank heuristic** – The estimator uses ``min(8, ceil(K / 10))`` when + ``rank="auto"``; raise the rank if residual correlations persist, or lower it + to avoid overfitting tiny samples. +- **Ridge parameters** – ``lam_F`` and ``lam_B`` default to ``1e-3``. Increase + them if the CG solver reports many iterations or the NLL trace stagnates. +- **Diagonal floor** – ``d_floor`` keeps the diagonal noise positive. Tighten it + (e.g. ``1e-6``) in ill-conditioned problems to prevent breakdowns. +- **Stopping criteria** – ALS stops when the relative improvement in the NLL is + below ``rel_tol`` (default ``1e-6``) or when ``max_sweeps`` is reached. Inspect + ``info_["nll_trace"]`` and ``info_["accept_t"]`` to diagnose plateaus. Next Steps ---------- - Read about the :doc:`mathematical_background` behind the algorithms - Learn about the differences in :doc:`als_vs_em` approaches -- Explore more detailed :doc:`examples` and use cases \ No newline at end of file +- Explore more detailed :doc:`examples` and use cases diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 0000000..e33780f --- /dev/null +++ b/tests/test_api.py @@ -0,0 +1,62 @@ +import numpy as np + +from alsgls import ALSGLS, ALSGLSSystem, als_gls, nll_per_row +from alsgls.ops import XB_from_Blist + + +def _random_sur(rng, N=60, K=4, p=3): + Xs = [rng.standard_normal((N, p)) for _ in range(K)] + B = [rng.standard_normal((p, 1)) for _ in range(K)] + F = rng.standard_normal((K, 2)) / np.sqrt(K) + D = 0.4 + 0.2 * rng.random(K) + Z = rng.standard_normal((N, 2)) + Y = XB_from_Blist(Xs, B) + Z @ F.T + rng.standard_normal((N, K)) * np.sqrt(D)[None, :] + return Xs, Y + + +def test_sklearn_api_matches_function(): + rng = np.random.default_rng(123) + Xs, Y = _random_sur(rng, N=120, K=5, p=4) + + direct = als_gls(Xs, Y, k=3, sweeps=10, rel_tol=1e-8) + model = ALSGLS(rank=3, max_sweeps=10, rel_tol=1e-8) + fitted = model.fit(Xs, Y) + + assert fitted is model + for b_direct, b_model in zip(direct[0], model.B_list_): + assert np.allclose(b_direct, b_model, atol=1e-8) + assert np.allclose(direct[1], model.F_, atol=1e-8) + assert np.allclose(direct[2], model.D_, atol=1e-8) + + preds = model.predict(Xs) + assert np.allclose(preds, XB_from_Blist(Xs, model.B_list_), atol=1e-10) + + score = model.score(Xs, Y) + nll = -score + assert np.isclose(nll, nll_per_row(Y - preds, model.F_, model.D_), atol=1e-10) + + +def test_system_api_mirrors_estimator(): + rng = np.random.default_rng(321) + Xs, Y = _random_sur(rng, N=80, K=3, p=2) + + system = {f"eq{j}": (Y[:, j], Xs[j]) for j in range(3)} + sys_model = ALSGLSSystem(system, rank=2, max_sweeps=9, rel_tol=1e-8) + results = sys_model.fit() + + assert results.model.keqs == 3 + assert results.model.nobs == 80 + + preds = results.predict() + assert np.allclose(preds, XB_from_Blist(Xs, results.B_list), atol=1e-10) + assert np.allclose(results.fittedvalues, preds) + + # Ensure the summary exposes key scalars + summary = results.summary_dict() + assert summary["keqs"] == 3 + assert summary["nobs"] == 80 + assert summary["rank"] == 2 + + # Recompute score and ensure consistency with estimator + estimator_score = sys_model.estimator_.score(Xs, Y) + assert np.isclose(estimator_score, -results.nll_per_row, atol=1e-10) diff --git a/tests/test_identities.py b/tests/test_identities.py index 0356fd5..ba117de 100644 --- a/tests/test_identities.py +++ b/tests/test_identities.py @@ -62,3 +62,20 @@ def test_nll_logdet_via_cholesky_consistency(): val_dense = 0.5 * (quad_dense / N + logdet_dense + K * np.log(2.0 * np.pi)) assert np.allclose(val, val_dense, rtol=RTOL, atol=ATOL) + + +def test_determinant_lemma_matches_dense(): + rng = np.random.default_rng(19) + K, k = 7, 3 + F = rng.standard_normal((K, k)) + D = rand_spd_diag(K, rng) + + S = F @ F.T + np.diag(D) + logdet_dense = float(np.linalg.slogdet(S)[1]) + + Dinv, C_chol = woodbury_chol(F, D) + logdet_diag = np.sum(np.log(np.clip(D, 1e-30, None))) + logdet_small = 2.0 * np.sum(np.log(np.diag(C_chol))) + logdet_lemma = logdet_diag + logdet_small + + assert np.allclose(logdet_dense, logdet_lemma, rtol=RTOL, atol=ATOL) diff --git a/tests/test_rotation_invariance.py b/tests/test_rotation_invariance.py new file mode 100644 index 0000000..d96e8ef --- /dev/null +++ b/tests/test_rotation_invariance.py @@ -0,0 +1,28 @@ +import numpy as np + +from alsgls import als_gls +from alsgls.metrics import nll_per_row +from alsgls.ops import XB_from_Blist + + +def test_nll_invariant_under_factor_rotation(): + rng = np.random.default_rng(5) + N, K, p, k = 150, 6, 4, 3 + Xs = [rng.standard_normal((N, p)) for _ in range(K)] + B_true = [rng.standard_normal((p, 1)) for _ in range(K)] + F_true = rng.standard_normal((K, k)) / np.sqrt(K) + D_true = 0.2 + 0.3 * rng.random(K) + Z = rng.standard_normal((N, k)) + Y = XB_from_Blist(Xs, B_true) + Z @ F_true.T + rng.standard_normal((N, K)) * np.sqrt(D_true) + + B_hat, F_hat, D_hat, _, _ = als_gls(Xs, Y, k=k, sweeps=10, rel_tol=1e-8) + residuals = Y - XB_from_Blist(Xs, B_hat) + base = float(nll_per_row(residuals, F_hat, D_hat)) + + Q, _ = np.linalg.qr(rng.standard_normal((k, k))) + F_rot = F_hat @ Q + cov_diag = np.diag(F_hat @ F_hat.T + np.diag(D_hat)) + D_rot = np.maximum(cov_diag - np.sum(F_rot * F_rot, axis=1), 1e-12) + + rotated = float(nll_per_row(residuals, F_rot, D_rot)) + assert np.isclose(base, rotated, rtol=5e-8, atol=5e-9)