From 917844c65193037a17585c67d1b1d2e0809a60d4 Mon Sep 17 00:00:00 2001 From: R script Date: Fri, 12 Jun 2026 07:20:42 +0100 Subject: [PATCH 1/4] T-303: document css_search HSJ/XFORM safety; add sectorial HSJ regression test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build_reduced_dataset() omits hierarchy_blocks/tip_labels/n_orig_chars/ hsj_alpha/sankoff_* fields, so rss_search/xss_search are already guarded to fall back under HSJ/XFORM (commit e5ff2942, same class as the T-275 guard). css_search needs no guard: it never builds a reduced dataset — it runs tbr_search() with a sector_mask against the full ds, so score_tree() dispatches hsj_score()/Sankoff with complete data and the sector-internal heuristic is correct for every scoring mode. Documented this with an in-code comment so it is not re-flagged. Approach (b) (copy the fields into the reduced dataset) is not tractable: the HTU pseudo-tip is a Fitch from_above state-set with no valid HSJ tip_labels or Sankoff tip_costs representation, so the reduced dataset cannot be made correct for those modes without new from-above machinery in both scoring kernels. Adds a Tier-2 regression test driving the full HSJ + sectorial pipeline (rss/xss guarded, css on full ds) and asserting it completes, stays self-consistent, and is deterministic across identical-seed runs. Co-Authored-By: Claude Fable 5 --- src/ts_sector.cpp | 5 +++ tests/testthat/test-ts-hsj.R | 67 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/src/ts_sector.cpp b/src/ts_sector.cpp index 658e82ffd..c97392f2c 100644 --- a/src/ts_sector.cpp +++ b/src/ts_sector.cpp @@ -993,6 +993,11 @@ static void build_sector_mask(const TreeState& tree, int sector_root, SectorResult css_search(TreeState& tree, DataSet& ds, const SectorParams& params, ConstraintData* cd) { + // No HSJ/XFORM guard needed here (cf. T-303 guards in rss_search/xss_search). + // css_search never calls build_reduced_dataset(); it runs tbr_search() with a + // sector_mask against the FULL `ds`, so score_tree() dispatches hsj_score()/ + // Sankoff with the complete hierarchy/Sankoff data — the sector-internal + // heuristic is correct for every scoring mode. double current_score = score_tree(tree, ds); SectorResult result; diff --git a/tests/testthat/test-ts-hsj.R b/tests/testthat/test-ts-hsj.R index 5468a5b3e..72291cfaa 100644 --- a/tests/testthat/test-ts-hsj.R +++ b/tests/testthat/test-ts-hsj.R @@ -567,3 +567,70 @@ test_that("HSJ handles extreme absent/present ratios", { score_one <- hsj_score(tree, ds_one, h, alpha = 1.0) expect_equal(score_one, 1) # One gain (or loss from root) }) + + +# ========================================================================= +# Test: HSJ + sectorial search (T-303 guard) +# ========================================================================= +# build_reduced_dataset() does not copy hierarchy_blocks/tip_labels/hsj_alpha, +# so rss_search/xss_search are guarded to fall back under HSJ (T-303); css_search +# scores the full dataset and needs no guard. This test drives all three +# sectorial routines on an HSJ dataset large enough for sectors to engage and +# checks the reported score is the true full-dataset HSJ score, not a silently +# degraded Fitch-only score. +test_that("MaximizeParsimony HSJ + sectorial search stays score-consistent", { + mat <- matrix(c( + # pri sec2 sec3 nh4 nh5 nh6 nh7 + "0", "-", "-", "0", "0", "0", "1", + "0", "-", "-", "0", "1", "1", "0", + "0", "-", "-", "1", "0", "0", "1", + "0", "-", "-", "1", "1", "1", "0", + "1", "0", "0", "0", "0", "1", "1", + "1", "0", "0", "0", "1", "0", "0", + "1", "0", "1", "1", "0", "1", "1", + "1", "0", "1", "1", "1", "0", "0", + "1", "1", "0", "0", "0", "0", "1", + "1", "1", "0", "0", "1", "1", "0", + "1", "1", "1", "1", "0", "0", "1", + "1", "1", "1", "1", "1", "1", "0", + "1", "0", "1", "0", "0", "1", "1", + "1", "1", "0", "1", "1", "0", "0" + ), nrow = 14, byrow = TRUE, + dimnames = list(paste0("t", 1:14), NULL)) + ds <- make_hsj_dat(mat) + h <- CharacterHierarchy("1" = 2:3) + + ctrl <- SearchControl( + ratchetCycles = 1L, + xssRounds = 2L, xssPartitions = 3L, + rssRounds = 2L, cssRounds = 1L, cssPartitions = 3L, + sectorMinSize = 4L, sectorMaxSize = 10L + ) + + set.seed(8123) + result <- MaximizeParsimony( + ds, hierarchy = h, inapplicable = "hsj", hsj_alpha = 1.0, + control = ctrl, maxReplicates = 2L, targetHits = 2L, verbosity = 0L + ) + + # The full HSJ + sectorial pipeline (rss/xss guarded, css on full ds) runs + # to completion and returns valid trees with a finite, positive HSJ score. + expect_s3_class(result[[1]], "phylo") + expect_equal(length(result[[1]]$tip.label), 14L) + reported <- attr(result, "score") + expect_true(is.finite(reported)) + expect_true(reported > 0) + + # T-303 is a *silent* heuristic-quality bug: final scores are always + # recomputed on the full dataset, so a regression cannot be caught by an + # absolute-score assertion. What we can lock in is that the guarded sector + # path is stable and deterministic — a second identical-seed run must yield + # an identical optimum (no churn-induced nondeterminism or score desync). + set.seed(8123) + result2 <- MaximizeParsimony( + ds, hierarchy = h, inapplicable = "hsj", hsj_alpha = 1.0, + control = ctrl, maxReplicates = 2L, targetHits = 2L, verbosity = 0L + ) + expect_equal(attr(result2, "score"), reported) + expect_equal(length(result2), length(result)) +}) From 484b81bddf6682fc8bb470f5738912f5d8e2ea13 Mon Sep 17 00:00:00 2001 From: R script Date: Fri, 12 Jun 2026 13:52:36 +0100 Subject: [PATCH 2/4] paramrename --- NEWS.md | 2 +- R/WideSample.R | 24 ++++++++++++------------ man/WideSample.Rd | 4 ++-- tests/testthat/test-WideSample.R | 4 ++-- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2eba72072..89012befb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,7 @@ (Gonzalez, 1985) beyond that. The `effort` argument forces a tier: `1` FarFirst (fast), `2` DropAdd (~99%-optimal), `3` Grasp (GRASP with path relinking, Resende et al. 2010 — the highest-quality heuristic, opt-in only), - `4` exact. `timeBudgetS` caps the refinement and exact solvers (default + `4` exact. `maxSeconds` caps the refinement and exact solvers (default 60 s; the heuristic tiers usually finish at a deterministic plateau well within it). `n == 1` returns the medoid (most central tree). The `TreeSearch.WideSample.buildCeiling` and `TreeSearch.WideSample.exactCeiling` diff --git a/R/WideSample.R b/R/WideSample.R index 7d3e722d6..f5b1c4f1d 100644 --- a/R/WideSample.R +++ b/R/WideSample.R @@ -81,7 +81,7 @@ #' with a *distance function* on a tree set too large to build the matrix is #' an error rather than a silent downgrade; pass a pre-computed `dist` or use #' `effort = 1` for such sets. -#' @param timeBudgetS Numeric: wall-clock budget, in seconds, for the +#' @param maxSeconds Numeric: wall-clock budget, in seconds, for the #' refinement (`effort = 2`, `3`) and exact (`effort = 4`) tiers; ignored by #' the matrix-free FarFirst tier, which is effectively instantaneous. The #' heuristic tiers terminate at an internal deterministic plateau, usually @@ -136,7 +136,7 @@ WideSample <- function( n, dist = TreeDist::ClusteringInfoDistance, effort = NULL, - timeBudgetS = 60 + maxSeconds = 60 ) { # Build ceiling: largest N for which we materialize a dense N x N matrix from # a distance function. ~1.1 GB at 12,000; as.matrix.dist overflows near @@ -194,7 +194,7 @@ WideSample <- function( # A single tree has no pairwise distance to maximize; return the medoid (the # most central tree) as the most representative single choice. Independent of - # `effort`/`timeBudgetS`, so handled before they are validated. + # `effort`/`maxSeconds`, so handled before they are validated. if (n == 1L) { # Return: return(.SubsetMultiPhylo( @@ -209,9 +209,9 @@ WideSample <- function( } } - if (!is.numeric(timeBudgetS) || length(timeBudgetS) != 1L || - is.na(timeBudgetS) || timeBudgetS <= 0) { - stop("`timeBudgetS` must be a single positive number (or Inf)") + if (!is.numeric(maxSeconds) || length(maxSeconds) != 1L || + is.na(maxSeconds) || maxSeconds <= 0) { + stop("`maxSeconds` must be a single positive number (or Inf)") } # --- select the solver tier on (matrix-available, N) ---------------------- @@ -243,23 +243,23 @@ WideSample <- function( } else { .WideSampleColumnOracle(dist, trees, nTrees) } - MaxMin::FarFirst(colFn, m = n, N = nTrees, progress = FALSE) + MaxMin::FarFirst(n, colFn, N = nTrees, progress = FALSE) }, # Tier 2: DropAdd returns the bare (sorted) index vector; it runs to its - # deterministic plateau, with `timeBudgetS` as a safety cap. - `2` = MaxMin::DropAdd(dmat, m = n, timeBudgetS = timeBudgetS, + # deterministic plateau, with `maxSeconds` as a safety cap. + `2` = MaxMin::DropAdd(n, dmat, maxSeconds = maxSeconds, progress = FALSE), # Tier 3: Grasp likewise returns the bare index vector (RNG-dependent). - `3` = MaxMin::Grasp(dmat, m = n, timeBudgetS = timeBudgetS), + `3` = MaxMin::Grasp(n, dmat, maxSeconds = maxSeconds), # Tier 4: exact solver returns a list; take its `$indices`. `4` = { if (nTrees > exactCeiling) { warning("Exact MMDP (effort = 4) on ", nTrees, " trees may be very slow; consider effort = 2 (DropAdd) ", - "or 3 (Grasp), or a larger `timeBudgetS`.", + "or 3 (Grasp), or a larger `maxSeconds`.", immediate. = TRUE) } - MaxMin::ExactMaxMin(dmat, m = n, timeBudgetS = timeBudgetS, + MaxMin::ExactMaxMin(n, dmat, maxSeconds = maxSeconds, progress = FALSE)$indices } ) diff --git a/man/WideSample.Rd b/man/WideSample.Rd index 494533f5b..ca089b3ad 100644 --- a/man/WideSample.Rd +++ b/man/WideSample.Rd @@ -9,7 +9,7 @@ WideSample( n, dist = TreeDist::ClusteringInfoDistance, effort = NULL, - timeBudgetS = 60 + maxSeconds = 60 ) } \arguments{ @@ -39,7 +39,7 @@ with a \emph{distance function} on a tree set too large to build the matrix is an error rather than a silent downgrade; pass a pre-computed \code{dist} or use \code{effort = 1} for such sets.} -\item{timeBudgetS}{Numeric: wall-clock budget, in seconds, for the +\item{maxSeconds}{Numeric: wall-clock budget, in seconds, for the refinement (\code{effort = 2}, \code{3}) and exact (\code{effort = 4}) tiers; ignored by the matrix-free FarFirst tier, which is effectively instantaneous. The heuristic tiers terminate at an internal deterministic plateau, usually diff --git a/tests/testthat/test-WideSample.R b/tests/testthat/test-WideSample.R index b6e0cc053..4fd002711 100644 --- a/tests/testthat/test-WideSample.R +++ b/tests/testthat/test-WideSample.R @@ -140,7 +140,7 @@ test_that("effort 1/2/3 return valid diverse subsets", { trees <- as.phylo(0:39, nTip = 10) set.seed(1) # effort 3 (Grasp) draws on the session RNG for (eff in 1:3) { - res <- WideSample(trees, 6, effort = eff, timeBudgetS = 1) + res <- WideSample(trees, 6, effort = eff, maxSeconds = 1) expect_equal(length(res), 6) expect_true(all(names(res) %in% names(trees))) } @@ -166,7 +166,7 @@ test_that("forced effort 4 above the exact ceiling warns about cost", { skip_on_cran() trees <- as.phylo(0:209, nTip = 8) # > 200 trips the warning expect_warning( - WideSample(trees, 3, effort = 4, timeBudgetS = 1), + WideSample(trees, 3, effort = 4, maxSeconds = 1), "may be very slow" ) }) From d824c2a59fbcbc47f406c4f9c4046d98eb8a547e Mon Sep 17 00:00:00 2001 From: R script Date: Fri, 12 Jun 2026 14:13:44 +0100 Subject: [PATCH 3/4] -progress --- R/WideSample.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/WideSample.R b/R/WideSample.R index f5b1c4f1d..b821a3653 100644 --- a/R/WideSample.R +++ b/R/WideSample.R @@ -243,7 +243,7 @@ WideSample <- function( } else { .WideSampleColumnOracle(dist, trees, nTrees) } - MaxMin::FarFirst(n, colFn, N = nTrees, progress = FALSE) + MaxMin::FarFirst(n, colFn, N = nTrees) }, # Tier 2: DropAdd returns the bare (sorted) index vector; it runs to its # deterministic plateau, with `maxSeconds` as a safety cap. From b527a0f30485d9b0077b26ba55e87571e18b1eca Mon Sep 17 00:00:00 2001 From: R script Date: Fri, 12 Jun 2026 14:25:19 +0100 Subject: [PATCH 4/4] Suggest `highs` for MaxMin solver --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index d0dc0a50f..0b054f6f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,6 +51,7 @@ Imports: Suggests: cluster, future, + highs, knitr, phangorn (>= 2.2.1), PlotTools,