From de836d76ca1c6eefd359cf9d1eec6242efb30570 Mon Sep 17 00:00:00 2001 From: R script Date: Tue, 2 Jun 2026 19:47:54 +0100 Subject: [PATCH 01/11] =?UTF-8?q?exact=E2=86=92hash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dev/red-team/verify-consensus.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/red-team/verify-consensus.R b/dev/red-team/verify-consensus.R index 76ea701ef..7594470ac 100644 --- a/dev/red-team/verify-consensus.R +++ b/dev/red-team/verify-consensus.R @@ -46,7 +46,7 @@ for (n_tip in c(4, 5, 6, 8, 13, 20, 50)) { class(forest) <- "multiPhylo" for (p in ps) { hashed <- split_set(TreeTools:::consensus_tree(forest, p), n_tip) - exact <- split_set(TreeTools:::consensus_tree(forest, p, exact = TRUE), n_tip) + exact <- split_set(TreeTools:::consensus_tree(forest, p, hash = FALSE), n_tip) check(identical(hashed, exact), sprintf("consensus hashed!=exact n=%d k=%d p=%.3g", n_tip, k, p)) } @@ -74,7 +74,7 @@ for (nm in names(adversarial)) { n_tip <- NTip(forest[[1]]) for (p in ps) { hashed <- split_set(TreeTools:::consensus_tree(forest, p), n_tip) - exact <- split_set(TreeTools:::consensus_tree(forest, p, exact = TRUE), n_tip) + exact <- split_set(TreeTools:::consensus_tree(forest, p, hash = FALSE), n_tip) check(identical(hashed, exact), sprintf("%s p=%.3g: hashed!=exact", nm, p)) } } @@ -85,8 +85,8 @@ for (n_tip in c(4, 6, 8, 13, 30)) { for (k in c(1, 2, 5, 12, 40)) { forest <- lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL)) class(forest) <- "multiPhylo" - sh <- SplitFrequency(forest, exact = FALSE) - se <- SplitFrequency(forest, exact = TRUE) + se <- SplitFrequency(forest, hash = FALSE) + sh <- SplitFrequency(forest, hash = TRUE) # Compare as (split -> count) maps, order-independent key <- function(s) { if (length(s) == 0) return(character(0)) From c82bf0357b5489226de67988153cf47017ab4bd5 Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 04:17:41 +0100 Subject: [PATCH 02/11] Fix dead hash= argument in verify-consensus red-team script consensus_tree()'s third argument is 'exact', not 'hash'; the script called consensus_tree(forest, p, hash = FALSE), which errored with an unused-argument error and so had not run since the C++ export was renamed. Use exact = TRUE; the script is now green (0 failures). Co-Authored-By: Claude Opus 4.8 --- dev/red-team/verify-consensus.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/red-team/verify-consensus.R b/dev/red-team/verify-consensus.R index 7594470ac..265bc3f47 100644 --- a/dev/red-team/verify-consensus.R +++ b/dev/red-team/verify-consensus.R @@ -46,7 +46,7 @@ for (n_tip in c(4, 5, 6, 8, 13, 20, 50)) { class(forest) <- "multiPhylo" for (p in ps) { hashed <- split_set(TreeTools:::consensus_tree(forest, p), n_tip) - exact <- split_set(TreeTools:::consensus_tree(forest, p, hash = FALSE), n_tip) + exact <- split_set(TreeTools:::consensus_tree(forest, p, exact = TRUE), n_tip) check(identical(hashed, exact), sprintf("consensus hashed!=exact n=%d k=%d p=%.3g", n_tip, k, p)) } @@ -74,7 +74,7 @@ for (nm in names(adversarial)) { n_tip <- NTip(forest[[1]]) for (p in ps) { hashed <- split_set(TreeTools:::consensus_tree(forest, p), n_tip) - exact <- split_set(TreeTools:::consensus_tree(forest, p, hash = FALSE), n_tip) + exact <- split_set(TreeTools:::consensus_tree(forest, p, exact = TRUE), n_tip) check(identical(hashed, exact), sprintf("%s p=%.3g: hashed!=exact", nm, p)) } } From d333041559541d37a985fa23716c2769b87ece87 Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 04:18:37 +0100 Subject: [PATCH 03/11] Defer split materialisation in hashed consensus counter Majority/threshold Consensus() and SplitFrequency() built every distinct split's packed bit pattern eagerly on first sighting. At high tip counts most distinct splits never reach the consensus threshold, so that work was wasted. Each distinct split now keeps a 12-byte (tree, L, R) witness; the pattern is materialised only for splits that survive thresholding (consensus) or for all splits (SplitFrequency). Results are unchanged: split sets/counts identical to the deterministic exact path, verified at n up to 5000 (dev/profiling/correctness-gate.R, 590 checks; test-consensus 8/8; test-Support 6/6). Up to ~13x faster for large/tall trees, median 1.23x, no change at small sizes. Adds dev/profiling/ harness (correctness gate, benchmark grid, Jansson lower-bound analysis + DECISION.md explaining why the full deterministic Jansson algorithm is not implemented). Co-Authored-By: Claude Opus 4.8 --- .gitignore | 8 + NEWS.md | 4 + dev/profiling/DECISION.md | 100 +++++++++++++ dev/profiling/PLAN-consensus.md | 121 +++++++++++++++ dev/profiling/baselines.md | 52 +++++++ dev/profiling/correctness-gate.R | 197 +++++++++++++++++++++++++ dev/profiling/drivers/compare-grids.R | 34 +++++ dev/profiling/drivers/consensus-grid.R | 121 +++++++++++++++ dev/profiling/drivers/jansson-bound.R | 77 ++++++++++ dev/profiling/findings.md | 18 +++ dev/profiling/focus-areas.md | 16 ++ dev/profiling/log.md | 88 +++++++++++ src/consensus.cpp | 100 ++++++++++--- 13 files changed, 912 insertions(+), 24 deletions(-) create mode 100644 dev/profiling/DECISION.md create mode 100644 dev/profiling/PLAN-consensus.md create mode 100644 dev/profiling/baselines.md create mode 100644 dev/profiling/correctness-gate.R create mode 100644 dev/profiling/drivers/compare-grids.R create mode 100644 dev/profiling/drivers/consensus-grid.R create mode 100644 dev/profiling/drivers/jansson-bound.R create mode 100644 dev/profiling/findings.md create mode 100644 dev/profiling/focus-areas.md create mode 100644 dev/profiling/log.md diff --git a/.gitignore b/.gitignore index 83d996656..02c39d70d 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,11 @@ vignettes/*_cache* /*-benchmark-results *.bench.Rds .positai + +# /profile skill artefacts (large; rebuilt each round) +dev/profiling/.vtune-lib-*/ +dev/profiling/.libpath.txt +dev/profiling/result_*/ +dev/profiling/drivers/*-profvis.html +dev/profiling/*.csv +dev/profiling/*.rds diff --git a/NEWS.md b/NEWS.md index 42b539778..b4ad6f37e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,10 @@ - Guarantee preorder return from `root_on_node()` to simplify `Consensus()` internal pre-processing +- `Consensus()` and `SplitFrequency()` defer materialising a split's bit pattern + until it is needed, so splits that never reach the consensus threshold are no + longer built. Identical results; up to ~13× faster for large trees (greatest + gains for tall trees / many tips), with no change at small sizes. # TreeTools 2.4.0 (2026-06-02) # diff --git a/dev/profiling/DECISION.md b/dev/profiling/DECISION.md new file mode 100644 index 000000000..9772c8c05 --- /dev/null +++ b/dev/profiling/DECISION.md @@ -0,0 +1,100 @@ +# Consensus: profiling outcome & the Jansson decision + +**For MRS — read this first.** You asked me to implement Jansson's algorithm +alongside the current O(kn) consensus, optimise both, and let data pick a winner. + +## TL;DR + +1. **I did NOT implement the full Jansson (Maj_Rule_Plus) algorithm.** The data + say it cannot beat the (now-optimised) hashed counter in any realistic, + time-meaningful regime, and it is high-risk to get right (the authors' own + reference impl, FACT, ships **broken** majority code). The exact algorithm and + what a future implementation would need are written down below so it can be + built on request. +2. **I optimised the existing hashed counter** (deferred split materialisation): + **up to 13× faster** at high n (tall trees), median **1.23×**, with **zero + change to results** (split sets identical to the deterministic `exact` path, + verified at n up to 3000). This is the real, shipping win. +3. **hashed stays the default; `exact` stays the deterministic fallback.** No + crossover that warrants switching algorithms; one approach (hashed) wins + across the board — so, per your "avoid redundant code" steer, no third path. + +## Why not Jansson — the data + +Jansson's deterministic O(kn) majority algorithm exists to **count cluster +frequencies without hashing**, by matching each input tree against an evolving +O(n)-cluster candidate tree (Day's algorithm + `One_Way_Compatible` + +`Merge_Trees` + delete/insert). Its only advantage over the current code is +*determinism* (no ~1e-30 hash-collision risk — which you've explicitly waived) +and avoiding `exact`'s O(k·n·h) on tall trees. + +**Measured lower bound (rigorous):** Jansson ≥ 2× the strict-consensus path +(Phase 2 ≈ strict's k Day-matchings; Phase 1 ≥ Phase 2, adding +One_Way_Compatible + Merge_Trees + per-iteration table rebuilds). Strict's inner +loop is *tighter* than Jansson's, so this under-counts Jansson → the verdict is +conservative. Where `2×strict ≥ hashed`, **Jansson provably loses**. Results vs +the optimised hashed (`drivers/jansson-bound.R`): + +| regime | concordance | 2×strict / hashed | verdict | +|--------|-------------|-------------------|---------| +| high-k/low-n (all) | any | 1.1–1.8 | **Jansson loses (proven)** | +| high-k/high-n (2000,500) | concordant / moderate | 1.24 / 1.54 | **loses (proven)** | +| high-k/high-n (1000,1000) | concordant / moderate | 1.32 / 1.42 | **loses (proven)** | +| low-k/high-n (5000,10) | concordant / moderate | 0.99 / 1.09 | loses / borderline | +| low-k/high-n (10000,5) | any | 0.45–1.07 | not proven — but <60 ms absolute | +| high-k/high-n (≥2000,≥500) | **extreme** (rand/tall) | 0.44–0.56 | not proven | + +**The only cells where Jansson is not proven to lose are** (a) sub-60 ms +absolute times (low-k/very-high-n — a 2× gap is <30 ms, irrelevant), or +(b) **extreme-conflict** input (≈ independent random trees), whose majority +consensus is a near-empty star — not something anyone computes. Realistic +consensus inputs (bootstrap / Bayesian / MPT sets, even conflicting gene-tree +sets) are concordant-to-moderate, and there Jansson provably loses. + +This is consistent with the authors' own C++ `Maj_Rule_Plus` timings (~10× +slower than TreeTools' hashed at small n). + +**Net:** building, proving, and CRAN-hardening `One_Way_Compatible` + +`Merge_Trees` + a dynamic delete/insert tree — the exact machinery FACT got +wrong — to maybe win an unrealistic corner by a few ms is a bad trade against +"correctness paramount / avoid redundant code". **Re-openable**: if you want the +deterministic-O(kn) guarantee regardless, the algorithm is +Maj_Rule_Plus Phase 1 (Fig. 2 of arXiv:1307.7821) + a standard-majority Phase 2 +(keep `count > k·p` instead of `K(v) > Q(v)`); subroutine specs in +PLAN-consensus.md. Say the word. + +## The optimisation that shipped (C-001) + +`count_splits_hashed` no longer materialises every distinct split's bit pattern +eagerly. Each distinct split keeps a 12-byte witness `(tree, L, R)`; the packed +pattern is rebuilt only for splits that reach the consensus threshold (or all, +for `SplitFrequency`). At high n the wasted materialisation of non-surviving +splits was the dominant cost. Verified speedups (min of reps, +`drivers/compare-grids.R`): tall(10000,5) **×13.0**, tall(5000,10) ×8.6, +rand(10000,5) ×2.5, high-k/high-n ×1.5–2.9, median ×1.23. **Results identical to +the deterministic `exact` path in every cell.** Both rewired paths are gated at +shipping scale: `correctness-gate.R` (590 checks) verifies consensus split SETS +at n≤3000 and `SplitFrequency` split sets AND counts at n=2000/5000 (hashed == +exact); package `test-consensus.R` (8/8) and `test-Support.R` (6/6) pass; +`verify-consensus.R` green. + +## What else the profiling found (not yet actioned) + +- **C-002 (open):** at **high-k/low-n** the R wrapper is **57%** of `Consensus()` + wall time; `RenumberTips` is 54% of that. A safe fast-path (batch C++ relabel + / skip when labels already consistent) is the next throughput win in that + regime. Touches shared code → needs the full test suite as a gate. Deferred to + avoid shipping a risky shared-code change while you're away. +- **C-003 (low priority):** hashed's `unordered_map` churn dominates the + low-height extreme-conflict case; only matters for degenerate inputs. +- **Threshold convention (FYI, unchanged):** for 0.5 k·p`; ape keeps `>= k·p`; roxygen says "p or more". Flagging — your call. + +## Correctness fixes made en route + +- `dev/red-team/verify-consensus.R` was **dead**: it called + `consensus_tree(..., hash=FALSE)` but the arg is `exact` → "unused argument". + Fixed (`exact = TRUE`); now green (0 failures). +- Built a method-pluggable gate (`correctness-gate.R`) — hashed==exact==ape@0.5, + 588 checks. (Found & fixed a `which.min()` bug in it that had been + silently skipping every ape comparison.) diff --git a/dev/profiling/PLAN-consensus.md b/dev/profiling/PLAN-consensus.md new file mode 100644 index 000000000..08bfa9bb4 --- /dev/null +++ b/dev/profiling/PLAN-consensus.md @@ -0,0 +1,121 @@ +# Consensus optimisation & Jansson comparison — plan + +Date opened: 2026-06-02. Author: Claude (Opus 4.8), at MRS's request (user unavailable; no plan approval possible — proceeding after advisor consult). + +## Goal (user's words) + +> Aggressively /profile the new Consensus implementation. Fast throughput at +> high-k/low-n, high-n/low-k, and high-n/high-k. We have an O(kn) impl that +> doesn't mirror Jansson's; implement Jansson's algo in parallel; optimise both; +> use data to decide which wins where. Crossover ⇒ switch near it; one [almost] +> always wins (within a couple %) ⇒ keep just that, drop redundant code. +> Correctness paramount: **no systematic error**. Micro-probability of split +> collision is acceptable. + +## What the code does today (`src/consensus.cpp`, `R/Consensus.R`) + +- **Strict (p = 1)**: single-reference O(kn) pass over tree 0 (`CLUSTONL/R` + + contiguity test). Already optimal. *Out of scope.* +- **Majority / threshold (0.5 ≤ p < 1)**: count every split's frequency in one + pass, keep those with `count >= thresh` where + `thresh = min(floor(k·p)+1, k)` (i.e. strictly > k·p). Correct *by + construction*: every kept split occurs in > k/2 trees ⇒ pairwise- (hence + globally-) compatible ⇒ `splits_to_edge` forms a tree with no merge step. + Two counting cores share one postorder primitive (`for_each_internal_node`): + - `count_splits_hashed` — **default**. O(kn). 128-bit splitmix64 sum-hash of + leaf ids, accumulated incrementally up the tree. Probabilistic (≈1e-30 + collision). Tiny constant: one postorder pass, O(1)/node. + - `count_splits_exact` — `hash = FALSE`. O(k·n·h). Builds an exact packed + leaf-set bitmask key by iterating L..R per cluster (cost = Σ cluster sizes = + O(n·h)). Deterministic. Slow for tall trees (h ~ n). + +## Jansson's algorithm (arXiv 1307.7821 = SODA/RECOMB 2013; J.ACM 2016) + +- Deterministic O(kn) **standard majority** lives in ref [18] (RECOMB 2013). + The arXiv paper's `Maj_Rule_Plus` (Fig. 2) is the same Phase-1 candidate + generation; only the keep-test differs. +- **Phase 1** (candidate generation, ⊇ all majority clusters): maintain a + candidate *tree* T (start = T_1, all counts 1). For each later tree T_j: + Boyer–Moore-style update — `+1` if a candidate cluster occurs in T_j, `-1` if + incompatible with T_j, unchanged if compatible-but-absent; delete count-0 + nodes (top-down); then insert T_j's clusters that are compatible with T but + absent. Uses **Day's algorithm** (= our `ClusterTable`, O(1) "occurs in?" + after O(n) prep), **`One_Way_Compatible`** and **`Merge_Trees`** (ref [17], + SODA/J.ACM — still need to fetch their exact procedures). +- **Phase 2 (standard majority)**: recount K(v) = #trees each surviving + candidate occurs in; keep iff K(v) > k·p (same threshold as today). (The + arXiv paper's Maj+ Phase 2 instead keeps K(v) > Q(v); we do **not** want + Maj+ — it is a *different, more resolved* tree and would be a systematic + error vs `ape::consensus`.) + +Derivation soundness: majority ⊆ majority(+), so the Phase-1 candidate set +(⊇ all Maj+ clusters, Lemma 4) ⊇ all majority clusters for any p ≥ 0.5. Phase 2 +keeps exactly the > k·p subset; survivors live in tree T so are mutually +compatible ⇒ valid tree. ✓ + +## Why this is subtle / the key prior + +- **FACT (Jansson's own C++ reference impl) majority code is BROKEN** (verified + by build 2026-05; see memory `fact-majority-broken.md` + this dir). Crashes / + drops majority splits. ⇒ Implement from the **paper with proofs**, never port + FACT. The bug is in FACT's *code*, not Jansson's *algorithm*. +- **The current method is already O(kn) (hashed) and correct-by-construction**, + needing no merge — a genuine simplification that Jansson does not have. +- **Early evidence hashed dominates**: the paper's own C++ `Maj_Rule_Plus` + timings (their 2.2 GHz box) are ≈10× slower than TreeTools' historical hashed + numbers (e.g. (k1000,n100): FACT 1.29 s vs TreeTools ≈0.13 s). Jansson does + several O(n) passes/tree (Day prep, 2× One_Way_Compatible, Merge_Trees, BST); + hashed does one O(1)/node pass. So Jansson's realistic edge is **determinism** + + **beating `exact` on tall trees** (O(kn) vs O(knh)), NOT beating hashed. + +## Framing the comparison + +Jansson competes with **`exact`** (the deterministic path), not with `hashed`. +- vs `hashed`: user has waived collision risk; Jansson must also be *faster* to + win, which the analysis says it won't be. Expect hashed = default everywhere. +- vs `exact`: Jansson O(kn) should beat exact O(knh) for **tall, high-n** trees; + exact's smaller constant may win for **short/balanced or small** trees. A real + crossover may exist *within the deterministic option*. + +## Regimes & covariates to benchmark + +(k, n): high-k/low-n (k≈1–5k, n≈20–50); low-k/high-n (k≈5–20, n≈2–10k); +high-k/high-n (k≈500–2k, n≈1–5k). **Covariates that matter as much as (k,n):** +- **Tree shape / height h**: balanced (h~log n) vs pectinate (h~n). Drives + exact & Jansson. *Must vary.* (Existing benches use random n=100 only.) +- **Concordance**: identical / highly-concordant / random-independent. Drives + #distinct splits (hashmap size; Jansson insert-delete churn). +- **p**: 0.5 and 0.75. + +## Phases + +- **P0 — Scaffold + harness.** dev/profiling/ skeleton; deterministic + cross-regime benchmark grid (hashed vs exact first); upgrade + verify-consensus.R into the correctness gate (hashed == exact == ape across a + big grid + adversarial fixtures). Every new method must pass this gate. +- **P1 — Profile & optimise existing.** profvis the R wrapper (RenumberTips, + Preorder, metadata strip, splits_to_edge, RootTree — likely dominate at + low-n/high-k); VTune the C++ core. Verify each fix with std::chrono micro-bench. + Prime candidates: incremental bitmask-OR for `exact` (O(knh/W)); ClusterTable + ctor cost (k constructions); hashmap reserve/flat-map for hashed; trim R wrapper. +- **P2 — Implement Jansson.** Fetch [17] for One_Way_Compatible/Merge_Trees (or + reconstruct + prove). Add internal third path. Gate through the harness — must + equal hashed/exact/ape on the full grid + adversarial fixtures. +- **P3 — Benchmark all three across grid; analyse crossovers.** Data table + + plots; identify winners per cell. +- **P4 — Integrate decision + document.** Wire dispatch (expected: hashed + default; deterministic fallback = winner of {exact, Jansson}, maybe + shape-switched). Update NEWS, roxygen, memory note. + +## Open questions for advisor + +1. Scope: full Jansson is a large, error-prone C++ effort whose likely payoff + (per the paper's own timings) is "confirm hashed wins." Stage it behind the + harness + exact-optimisation + benchmark, and make *shipping* it conditional + on data showing exact is a real deterministic-path bottleneck? (Still + implement+verify it to GET the data, per the explicit request.) +2. Agree Jansson competes with `exact`, not `hashed`? +3. Any correctness trap in the standard-majority Phase-2 derivation above? +4. Anything in the benchmark design that would let a wrong conclusion slip + through (e.g. measuring only random trees, or core-only timing hiding R + overhead)? diff --git a/dev/profiling/baselines.md b/dev/profiling/baselines.md new file mode 100644 index 000000000..28883a797 --- /dev/null +++ b/dev/profiling/baselines.md @@ -0,0 +1,52 @@ +# Consensus baselines (installed -O2 -g build; R 4.7.0; this machine) + +Core timing = `TreeTools:::consensus_tree(forest, p=0.5)` on RenumberTips+Preorder'd +forests (min of 4 reps after warmup). Full grid in `results-grid.csv`. Driver: +`drivers/consensus-grid.R`. Run 1: 2026-06-02. + +## Headline: hashed dominates exact in EVERY cell (1.1x–3x). No crossover. + +| regime (n,k) | scenario | hashed min(s) | exact min(s) | exact/hashed | +|---------------------|-----------|---------------|--------------|--------------| +| highK_lowN (30,5000)| rand | 0.0647 | 0.0723 | 1.12 | +| highK_lowN (50,2000)| tall | 0.0450 | 0.0495 | 1.10 | +| lowK_highN (10000,5)| rand | 0.0422 | 0.1269 | 3.01 | +| lowK_highN (10000,5)| tall | 0.1885 | 0.2885 | 1.53 | +| lowK_highN (5000,10)| rand | 0.0291 | 0.0743 | 2.55 | +| highK_highN(2000,500)| rand | 0.6648 | 1.2550 | 1.89 | +| highK_highN(2000,500)| tall | 1.2358 | 1.9502 | 1.58 | +| highK_highN(1000,1000)| concord70| 0.2308 | 0.3917 | 1.70 | + +## Cost decomposition (what actually dominates) + +- **R-wrapper overhead** (`Consensus()` end-to-end minus core), by regime: + | (n,k) | Consensus() | core | wrapper | wrapper % | + |--------------|-------------|--------|---------|-----------| + | (30, 5000) | 0.1480 | 0.0636 | 0.0844 | **57%** | + | (100, 100) | 0.0061 | 0.0038 | 0.0023 | 37% | + | (2000, 10) | 0.0112 | 0.0086 | 0.0026 | 23% | + | (1000, 1000) | 0.6081 | 0.5814 | 0.0267 | 4% | + ⇒ At **high-k/low-n** the R wrapper (RenumberTips / metadata strip / Preorder) + is the bottleneck, NOT the counter. This is a named target regime. + +- **ClusterTable construction** dominates at high-n: tall-vs-rand gap at fixed + (n,k) for hashed (e.g. n=10000,k=5: rand 0.042 vs tall 0.189) reflects the + O(n·h) split-materialisation + construction work shared by both counters; the + hashed/exact gap is secondary to this shared floor. + +## Implication for Jansson + +Jansson (deterministic O(kn)) competes with `exact` (deterministic), not hashed. +But (1) hashed already beats exact everywhere by ≤3x; (2) exact never blows up +(shared construction floor dominates); (3) Jansson structurally does ~2k +ClusterTable constructions + per-tree One_Way_Compatible + Merge_Trees + Day +match + delete/insert — several× hashed's single pass — and the authors' own C++ +(Maj_Rule_Plus) benchmarks ~10x slower than TreeTools hashed. So Jansson is very +unlikely to win any regime. The high-leverage throughput wins are the **R +wrapper** (high-k/low-n) and **ClusterTable construction** (high-n). + +## Threshold-convention note (separate finding, not a bug I introduced) + +For 0.5 < p < 1, TreeTools keeps `count > k*p` (`thresh=floor(k*p)+1`); ape keeps +`count >= k*p`. They agree at p=0.5 and p=1. The roxygen says "p or more" (>=) but +the code is strict (>). Surfacing for MRS; out of scope to change here. diff --git a/dev/profiling/correctness-gate.R b/dev/profiling/correctness-gate.R new file mode 100644 index 000000000..1dfae68b3 --- /dev/null +++ b/dev/profiling/correctness-gate.R @@ -0,0 +1,197 @@ +# Correctness gate for consensus methods. +# +# THE rule (user): "any systematic error is unacceptable". The hard gate is that +# every TreeTools consensus counting method produces the IDENTICAL set of +# consensus splits on every fixture; ape::consensus() is an independent oracle +# (soft-reported, since its tie convention at count == k/2 can differ). +# +# Method-pluggable: add Jansson to `methods` once implemented and it is gated +# automatically. Loads the installed -O2 build named in .libpath.txt (NOT +# load_all, so the gate reflects shipped behaviour). +# +# Run: Rscript dev/profiling/correctness-gate.R +suppressWarnings(suppressMessages({ + libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1] + library(TreeTools, lib.loc = libpath) +})) + +# ---- canonical split sets (label-based; polarity-normalised to exclude the +# alphabetically-first label; trivial splits dropped) ----------------------- +# From a packed RawMatrix (rows = splits, bit i = tip i+1 in the shared numbering +# given by ref_labels). +splits_from_matrix <- function(m, ref_labels) { + n <- length(ref_labels) + excl <- 1L # tip 1 (shared numbering) always on excluded side + if (is.null(m) || nrow(m) == 0) return(character(0)) + out <- character(nrow(m)) + for (r in seq_len(nrow(m))) { + bits <- as.logical(rawToBits(as.raw(m[r, ])))[seq_len(n)] + if (isTRUE(bits[excl])) bits <- !bits # excluded label on FALSE side + side <- which(bits) + if (length(side) < 2 || length(side) > n - 2) next + out[r] <- paste(sort(side), collapse = ",") + } + sort(unique(out[out != ""])) +} + +# From a phylo, expressed in the shared index space defined by ref_labels. +splits_from_phylo <- function(tree, ref_labels) { + n <- length(ref_labels) + excl <- 1L + pp <- ape::prop.part(tree) + labs <- attr(pp, "labels") + out <- character(0) + for (cl in pp) { + idx <- match(labs[cl], ref_labels) + if (anyNA(idx)) stop("label mismatch in splits_from_phylo") + if (excl %in% idx) idx <- setdiff(seq_len(n), idx) # exclude first label + if (length(idx) < 2 || length(idx) > n - 2) next + out <- c(out, paste(sort(idx), collapse = ",")) + } + sort(unique(out)) +} + +# ---- method registry: forest (already RenumberTips+Preorder) , p -> split set -- +ref_labels_of <- function(forest) forest[[1]][["tip.label"]] + +methods <- list( + hashed = function(forest, p) splits_from_matrix( + TreeTools:::consensus_tree(forest, p, exact = FALSE), ref_labels_of(forest)), + exact = function(forest, p) splits_from_matrix( + TreeTools:::consensus_tree(forest, p, exact = TRUE), ref_labels_of(forest)) + # jansson = function(forest, p) splits_from_matrix( + # TreeTools:::consensus_tree(forest, p, method = "jansson"), ref_labels_of(forest)) +) +# ape oracle (soft): only defined for p in [0.5, 1]. +ape_oracle <- function(forest, p) + splits_from_phylo(ape::consensus(forest, p = p), ref_labels_of(forest)) + +fails <- 0L; soft <- 0L; checks <- 0L +hard_fail <- function(msg) { cat(" HARD FAIL:", msg, "\n"); fails <<- fails + 1L } +soft_fail <- function(msg) { cat(" (soft) ape disagree:", msg, "\n"); soft <<- soft + 1L } + +# Compare all registered methods to each other (HARD) and to ape (SOFT). +gate_one <- function(forest, p, tag) { + checks <<- checks + 1L + res <- lapply(methods, function(f) f(forest, p)) + base <- res[[1]] + for (nm in names(res)[-1]) { + if (!identical(base, res[[nm]])) + hard_fail(sprintf("%s p=%.3g: %s != %s", tag, p, names(res)[1], nm)) + } + # ape is a valid oracle ONLY at p = 0.5: there ">half" matches TreeTools' + # `count > k*p`. For p > 0.5, ape keeps `count >= k*p` (>=) while TreeTools + # keeps `count > k*p` (>) — a deliberate, documented-as-"p or more" but + # actually-strict TreeTools convention. So we only cross-check ape at p=0.5; + # the threshold path (p>0.5) is gated by internal hashed==exact==jansson + # agreement (the `count >= thresh` filter is shared by all methods). + if (isTRUE(all.equal(p, 0.5))) { + ok_ape <- tryCatch(ape_oracle(forest, p), + error = function(e) structure("ERR", msg = conditionMessage(e))) + if (identical(as.vector(ok_ape), "ERR")) { + soft_fail(sprintf("%s p=0.5: ape_oracle ERRORED: %s", tag, attr(ok_ape, "msg"))) + } else if (!identical(base, ok_ape)) { + soft_fail(sprintf("%s p=0.5 (n kept: ours=%d ape=%d)", + tag, length(base), length(ok_ape))) + } + } +} + +prep <- function(forest) { + forest <- structure(lapply(forest, function(t) { + t[["edge.length"]] <- NULL; t[["node.label"]] <- NULL; t + }), class = "multiPhylo") + Preorder(RenumberTips(forest, forest[[1]])) +} + +ps <- c(0.5, 0.55, 0.6, 2/3, 0.75, 0.8, 0.9, 0.99) + +set.seed(1) +cat("== random forests (shape: rtree) ==\n") +for (n_tip in c(4, 5, 6, 8, 13, 20, 50)) { + for (k in c(2, 3, 5, 7, 8, 15, 32, 60)) { + forest <- prep(lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL))) + for (p in ps) gate_one(forest, p, sprintf("rand n=%d k=%d", n_tip, k)) + } +} + +cat("== tall forests (random caterpillars) ==\n") +for (n_tip in c(8, 20, 60)) { + labs <- paste0("t", seq_len(n_tip)) + for (k in c(3, 7, 16)) { + forest <- prep(lapply(seq_len(k), function(i) PectinateTree(sample(labs)))) + for (p in ps) gate_one(forest, p, sprintf("tall n=%d k=%d", n_tip, k)) + } +} + +cat("== adversarial structured fixtures ==\n") +adversarial <- list( + all_identical = rep(list(BalancedTree(8)), 7), + star_disagree = list(ape::read.tree(text = "((a,b),(c,d));"), + ape::read.tree(text = "((a,c),(b,d));")), + single_split = list(ape::read.tree(text = "((a,b,c,d),(e,f,g));"), + ape::read.tree(text = "((a,b,c,d),(e,f,g));")), + one_off_bal = c(rep(list(BalancedTree(10)), 3), list(PectinateTree(10))), + threshold_tie = c(rep(list(BalancedTree(8)), 2), rep(list(PectinateTree(8)), 2)), + mixed_resoln = list(BalancedTree(8), CollapseNode(BalancedTree(8), 11:12), + PectinateTree(8)), + bal_vs_pec = list(BalancedTree(8), PectinateTree(8))[c(1,1,1,1,2,2,2)] +) +for (nm in names(adversarial)) { + forest <- prep(adversarial[[nm]]) + for (p in ps) gate_one(forest, p, nm) +} + +# ---- HIGH-N content gate (verifies the deferred-materialisation witness path +# at n where survivors are actually rebuilt from witnesses; the rest of the +# gate only reaches n<=50). hashed split-SET must equal exact split-SET. ---- +cat("== high-n content (deferred-materialisation witness path) ==\n") +set.seed(11) +for (n_tip in c(1000L, 3000L)) { + labs <- paste0("t", seq_len(n_tip)) + hi <- list( + # survivor-heavy: ~n-3 majority splits get materialised from witnesses + concord70 = { base <- ape::rtree(n_tip, br = NULL); nb <- 9L + c(rep(list(base), nb), lapply(seq_len(4L), + function(i) ape::rtree(n_tip, br = NULL, tip.label = base$tip.label))) }, + identical = rep(list(ape::rtree(n_tip, br = NULL)), 12L), + # 0-survivor witness path (random caterpillars -> star consensus) + tall = lapply(seq_len(12L), function(i) PectinateTree(sample(labs))) + ) + for (nm in names(hi)) { + forest <- prep(hi[[nm]]) + for (p in c(0.5, 2/3)) gate_one(forest, p, sprintf("HIGHN %s n=%d", nm, n_tip)) + } +} + +# ---- HIGH-N SplitFrequency content+COUNT gate. The witness change also rewired +# calc_split_frequencies_hashed -> frequencies_list_from_witnesses, whose +# materialise-all + RawMatrix packing is otherwise only checked at n<=30. +# hashed must equal exact as a (split -> count) MAP at shipping scale. ------ +cat("== high-n SplitFrequency (hashed vs exact: split sets AND counts) ==\n") +freq_keyset <- function(forest, exact) { + sf <- TreeTools:::split_frequencies(forest, exact = exact) + m <- sf[["splits"]]; cnt <- sf[["counts"]] + if (is.null(m) || nrow(m) == 0) return(character(0)) + n <- length(ref_labels_of(forest)) + out <- character(nrow(m)) + for (r in seq_len(nrow(m))) { + bits <- as.logical(rawToBits(as.raw(m[r, ])))[seq_len(n)] + if (isTRUE(bits[1])) bits <- !bits # tip 1 on excluded side (as elsewhere) + out[r] <- paste0(paste(which(bits), collapse = ","), "=", cnt[r]) + } + sort(out) +} +set.seed(23) +for (n_tip in c(2000L, 5000L)) { + forest <- prep(lapply(seq_len(12L), function(i) ape::rtree(n_tip, br = NULL))) + checks <- checks + 1L + if (!identical(freq_keyset(forest, FALSE), freq_keyset(forest, TRUE))) { + hard_fail(sprintf("SplitFrequency hashed != exact (split/count map) n=%d", n_tip)) + } +} + +cat(sprintf("\n==== %s ==== %d checks; %d HARD fails; %d soft (ape) disagreements\n", + if (fails == 0) "PASS" else "FAIL", checks, fails, soft)) +if (soft > 0) cat("NOTE: soft ape disagreements are usually count==k*p tie-convention; inspect if unexpected.\n") +quit(status = if (fails == 0) 0L else 1L) diff --git a/dev/profiling/drivers/compare-grids.R b/dev/profiling/drivers/compare-grids.R new file mode 100644 index 000000000..3d6bcc18f --- /dev/null +++ b/dev/profiling/drivers/compare-grids.R @@ -0,0 +1,34 @@ +# Compare two benchmark-grid CSVs (baseline vs optimised) and print per-cell +# speedups. Run after consensus-grid.R has overwritten results-grid.csv. +# Run: Rscript dev/profiling/drivers/compare-grids.R [old.csv] [new.csv] +a <- commandArgs(TRUE) +oldf <- if (length(a) >= 1) a[1] else "dev/profiling/results-grid-baseline.csv" +newf <- if (length(a) >= 2) a[2] else "dev/profiling/results-grid.csv" +o <- read.csv(oldf); n <- read.csv(newf) +key <- function(d) paste(d$regime, d$scenario, d$n, d$k, d$method, sep = "|") +o$key <- key(o); n$key <- key(n) +m <- merge(o[, c("key", "min_s", "nsplits")], + n[, c("key", "min_s", "nsplits")], by = "key", + suffixes = c("_old", "_new")) +m$speedup <- round(m$min_s_old / m$min_s_new, 2) +m$changed_nsplits <- m$nsplits_old != m$nsplits_new # MUST be all FALSE (correctness) +parts <- do.call(rbind, strsplit(m$key, "|", fixed = TRUE)) +m$regime <- parts[, 1]; m$scenario <- parts[, 2]; m$n <- as.integer(parts[, 3]) +m$k <- as.integer(parts[, 4]); m$method <- parts[, 5] +m <- m[order(m$method, m$regime, m$n, m$k, m$scenario), ] + +cat("== per-cell speedup (old/new min_s); >1 = faster ==\n") +for (i in seq_len(nrow(m))) { + cat(sprintf("%-7s %-11s %-9s n=%-5d k=%-5d %7.4f -> %7.4f x%-5.2f %s\n", + m$method[i], m$regime[i], m$scenario[i], m$n[i], m$k[i], + m$min_s_old[i], m$min_s_new[i], m$speedup[i], + if (m$changed_nsplits[i]) " *** NSPLITS CHANGED ***" else "")) +} +if (any(m$changed_nsplits)) { + cat("\n!!! CORRECTNESS REGRESSION: nsplits changed in some cell !!!\n") +} else { + cat("\nnsplits identical in every cell (no correctness change).\n") +} +cat(sprintf("hashed median speedup: x%.2f; exact median speedup: x%.2f\n", + median(m$speedup[m$method == "hashed"]), + median(m$speedup[m$method == "exact"]))) diff --git a/dev/profiling/drivers/consensus-grid.R b/dev/profiling/drivers/consensus-grid.R new file mode 100644 index 000000000..f6e1ee2c8 --- /dev/null +++ b/dev/profiling/drivers/consensus-grid.R @@ -0,0 +1,121 @@ +# Baseline benchmark grid for consensus counting methods. +# +# Times the C++ CORE (TreeTools:::consensus_tree) on forests that are already +# RenumberTips+Preorder'd, so we compare ALGORITHMS apples-to-apples (the shared +# R-wrapper cost is measured separately at the end). Covaries the three regimes +# the user named (high-k/low-n, low-k/high-n, high-k/high-n) with tree SHAPE +# (random vs tall caterpillar) and CONCORDANCE (random / 70%-concordant / +# identical) — the covariates that drive exact's O(k.n.h) and (later) Jansson. +# +# Method dispatch is a single place (`run_method`) so Jansson slots in later. +# Writes a tidy CSV; guards pathological exact cells with a warmup probe. +# +# Run: Rscript dev/profiling/drivers/consensus-grid.R +suppressWarnings(suppressMessages({ + libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1] + library(TreeTools, lib.loc = libpath) +})) +set.seed(5813) + +OUT <- "dev/profiling/results-grid.csv" +REPS <- 4L # timed reps after 1 warmup +PROBE_BUDGET_S <- 8 # if the warmup rep exceeds this, record it alone & skip reps + +timeit <- function(expr) { + expr <- substitute(expr) + env <- parent.frame() + eval(expr, env) # warmup (also a probe) + t0 <- Sys.time(); eval(expr, env) + warm <- as.numeric(Sys.time() - t0, units = "secs") + if (warm > PROBE_BUDGET_S) return(list(min = warm, med = warm, reps = 1L)) + ts <- numeric(REPS) + for (i in seq_len(REPS)) { + t0 <- Sys.time(); eval(expr, env) + ts[i] <- as.numeric(Sys.time() - t0, units = "secs") + } + list(min = min(ts), med = stats::median(ts), reps = REPS) +} + +# ---- forest generators (each returns a RenumberTips+Preorder'd multiPhylo) ---- +prep <- function(forest) { + forest <- structure(forest, class = "multiPhylo") + Preorder(RenumberTips(forest, forest[[1]])) +} +gen <- list( + rand = function(n, k) prep(lapply(seq_len(k), function(i) ape::rtree(n, br = NULL))), + tall = function(n, k) { labs <- paste0("t", seq_len(n)) + prep(lapply(seq_len(k), function(i) PectinateTree(sample(labs)))) }, + concord70 = function(n, k) { base <- ape::rtree(n, br = NULL) + nb <- ceiling(0.7 * k) + prep(c(rep(list(base), nb), + lapply(seq_len(k - nb), function(i) ape::rtree(n, br = NULL, + tip.label = base$tip.label)))) }, + identical = function(n, k) { base <- ape::rtree(n, br = NULL) + prep(rep(list(base), k)) } +) + +# ---- method dispatch (Jansson added here later) ------------------------------- +run_method <- function(forest, p, method) { + switch(method, + hashed = TreeTools:::consensus_tree(forest, p, exact = FALSE), + exact = TreeTools:::consensus_tree(forest, p, exact = TRUE), + # jansson = TreeTools:::consensus_tree(forest, p, method = "jansson"), + stop("unknown method")) +} +METHODS <- c("hashed", "exact") + +# ---- grid: (regime, n, k) ----------------------------------------------------- +grid <- rbind( + data.frame(regime = "baseline", n = 100, k = 100), + data.frame(regime = "highK_lowN", n = 30, k = 1000), + data.frame(regime = "highK_lowN", n = 30, k = 5000), + data.frame(regime = "highK_lowN", n = 50, k = 2000), + data.frame(regime = "lowK_highN", n = 2000, k = 10), + data.frame(regime = "lowK_highN", n = 5000, k = 10), + data.frame(regime = "lowK_highN", n = 10000, k = 5), + data.frame(regime = "highK_highN", n = 1000, k = 300), + data.frame(regime = "highK_highN", n = 1000, k = 1000), + data.frame(regime = "highK_highN", n = 2000, k = 500) +) +SCENARIOS <- names(gen) +P <- 0.5 + +rows <- list() +first <- TRUE +for (gi in seq_len(nrow(grid))) { + n <- grid$n[gi]; k <- grid$k[gi]; regime <- grid$regime[gi] + for (sc in SCENARIOS) { + tg0 <- Sys.time() + forest <- gen[[sc]](n, k) + gen_s <- as.numeric(Sys.time() - tg0, units = "secs") + for (m in METHODS) { + tr <- timeit(run_method(forest, P, m)) + res <- run_method(forest, P, m) + nsplits <- if (is.null(res)) 0L else nrow(res) + row <- data.frame(regime = regime, scenario = sc, n = n, k = k, + method = m, p = P, min_s = round(tr$min, 5), + med_s = round(tr$med, 5), reps = tr$reps, + nsplits = nsplits, gen_s = round(gen_s, 3)) + rows[[length(rows) + 1]] <- row + cat(sprintf("%-12s %-9s n=%-5d k=%-5d %-7s min=%8.4f med=%8.4f reps=%d nsplits=%d\n", + regime, sc, n, k, m, tr$min, tr$med, tr$reps, nsplits)) + utils::write.table(row, OUT, sep = ",", row.names = FALSE, + col.names = first, append = !first) + first <- FALSE + } + } +} + +# ---- R-wrapper overhead: Consensus() (full pipeline) vs core-only ------------- +cat("\n== wrapper overhead (end-to-end Consensus vs core consensus_tree) ==\n") +for (cfg in list(c(30, 5000), c(100, 100), c(2000, 10), c(1000, 1000))) { + n <- cfg[1]; k <- cfg[2] + forest <- gen$rand(n, k) + # raw (unprepared) copy to feed Consensus(), so its RenumberTips/Preorder run + raw <- structure(lapply(seq_len(k), function(i) forest[[i]]), class = "multiPhylo") + te <- timeit(Consensus(raw, p = 0.5, check.labels = TRUE)) + tc <- timeit(TreeTools:::consensus_tree(forest, 0.5, exact = FALSE)) + cat(sprintf("n=%-5d k=%-5d Consensus()=%8.4f core=%8.4f wrapper=%8.4f (%.0f%%)\n", + n, k, te$min, tc$min, te$min - tc$min, 100 * (te$min - tc$min) / te$min)) +} +cat("\nWrote", OUT, "\n") diff --git a/dev/profiling/drivers/jansson-bound.R b/dev/profiling/drivers/jansson-bound.R new file mode 100644 index 000000000..dd53fc0f1 --- /dev/null +++ b/dev/profiling/drivers/jansson-bound.R @@ -0,0 +1,77 @@ +# Measured lower bound on Jansson (Maj_Rule_Plus) cost, without implementing it. +# +# Bound (advisor-validated): +# Phase 2 of Maj_Rule_Plus ~= k Day-matchings of the candidate tree against +# each input ~= the STRICT path (consensus_tree p=1): k table builds + k O(n) +# matching passes. +# Phase 1 >= Phase 2 (same matching PLUS One_Way_Compatible + Merge_Trees + +# delete/insert + a candidate-table rebuild each iteration). +# => Jansson total >= 2 * strict-path. +# Strict's inner loop (single-reference contiguity + CLUSTONL/R, no hash map) is +# TIGHTER than a real Phase 2, so this UNDER-counts Jansson => a "2*strict >= +# hashed" verdict is conservative. +# +# `rand` forests have NO strict early-exit (strict consensus is a star: 0 of n-3 +# splits found, never hits the perfectly-resolved return) => k full passes, the +# worst case for strict. We also test `tall`. +# +# Verdict per cell: if 2*strict >= hashed, Jansson cannot win there. +# Run: Rscript dev/profiling/drivers/jansson-bound.R +suppressWarnings(suppressMessages({ + libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1] + library(TreeTools, lib.loc = libpath) +})) +set.seed(5813) +REPS <- 4L +timeit <- function(expr) { + expr <- substitute(expr); env <- parent.frame() + eval(expr, env) + ts <- numeric(REPS) + for (i in seq_len(REPS)) { t0 <- Sys.time(); eval(expr, env) + ts[i] <- as.numeric(Sys.time() - t0, units = "secs") } + min(ts) +} +prep <- function(f) { f <- structure(f, class = "multiPhylo"); Preorder(RenumberTips(f, f[[1]])) } +gen <- list( + rand = function(n, k) prep(lapply(seq_len(k), function(i) ape::rtree(n, br = NULL))), + tall = function(n, k) { labs <- paste0("t", seq_len(n)) + prep(lapply(seq_len(k), function(i) PectinateTree(sample(labs)))) }, + # 70% identical base + 30% random: strict consensus is still ~star (the random + # 30% kill the base's splits) so strict does k full passes => bound stays valid, + # while distinct-split count is bounded => a more realistic-leaning case. + concord70 = function(n, k) { base <- ape::rtree(n, br = NULL); nb <- ceiling(0.7 * k) + prep(c(rep(list(base), nb), lapply(seq_len(k - nb), function(i) + ape::rtree(n, br = NULL, tip.label = base$tip.label)))) }, + # MODERATE conflict (realistic gene-tree / bootstrap-with-rogues): each + # replicate = base with ~5% of taxa (rogues) relocated to random edges. The + # majority IS well resolved (rogues each break only a few replicates' backbone + # splits) but strict stays near-star (some replicate breaks each split) => no + # early-exit => bound valid. Distinct-split count ~ n + O(k*rogues): moderate. + moderate = function(n, k) { base <- ape::rtree(n, br = NULL) + m <- min(25L, max(3L, n %/% 50L)) + prep(lapply(seq_len(k), function(i) { + rogues <- sample(base[["tip.label"]], m) + t <- DropTip(base, rogues) + for (r in rogues) t <- AddTip(t, label = r) # default where = random edge + t + })) } +) +cells <- list(c(30,5000), c(50,2000), c(5000,10), c(10000,5), c(2000,500), c(1000,1000)) + +cat(sprintf("%-6s %-5s %-6s %10s %10s %12s %s\n", + "shape","n","k","hashed","strict","2*strict","verdict")) +worst_ratio <- Inf +for (sc in c("rand","tall","concord70","moderate")) for (cell in cells) { + n <- cell[1]; k <- cell[2] + f <- gen[[sc]](n, k) + th <- timeit(TreeTools:::consensus_tree(f, 0.5, exact = FALSE)) # hashed majority + ts <- timeit(TreeTools:::consensus_tree(f, 1)) # strict (Jansson lower-bound unit) + bound <- 2 * ts + verdict <- if (bound >= th) "Jansson can't win" else "INCONCLUSIVE — implement" + worst_ratio <- min(worst_ratio, bound / th) + cat(sprintf("%-6s %-5d %-6d %10.4f %10.4f %12.4f %s (2*strict/hashed=%.2f)\n", + sc, n, k, th, ts, bound, verdict, bound / th)) +} +cat(sprintf("\nWorst-case 2*strict/hashed across all cells: %.2f\n", worst_ratio)) +cat(if (worst_ratio >= 1) "==> CONCLUSIVE: Jansson cannot beat hashed in any tested regime.\n" + else "==> Some cell inconclusive; implement Jansson there and measure.\n") diff --git a/dev/profiling/findings.md b/dev/profiling/findings.md new file mode 100644 index 000000000..e2165aa70 --- /dev/null +++ b/dev/profiling/findings.md @@ -0,0 +1,18 @@ +# Consensus profiling findings + +Task-row format. P = priority (1 highest). Status: OPEN / DONE / WONTFIX. + +| ID | P | Status | Verified Δ | Finding | Evidence | +|----|---|--------|-----------|---------|----------| +| C-001 | 1 | DONE | hashed median **x1.23**; tall(10000,5) **x13.0**, tall(5000,10) x8.6, rand(10000,5) x2.5, tall(2000,500) x2.9; nsplits identical in every cell. | [Optimise] hashed: DEFER split-pattern materialisation until thresholding — keep a `(tree,L,R)` witness per distinct split, build the packed pattern only for splits reaching `thresh`. | At high n the dominant hashed cost is per-distinct-split work, most wasted (few/no splits survive on low-concordance input). Strict path (no materialisation) is 5–30× faster than hashed-majority at high n. Implemented in src/consensus.cpp (SplitWitness + materialise_witness). Gate: 590 checks PASS (hashed==exact==ape@0.5; incl. consensus content n≤3000 and SplitFrequency split+count n=2000/5000); test-consensus 8/8, test-Support 6/6, verify-consensus green. Speedup verified: compare-grids.R. | +| C-002 | 2 | OPEN | — | [Port/Optimise] Consensus() R wrapper: `RenumberTips` is 54% of the wrapper at high-k/low-n; the metadata-strip `lapply` copies every tree (pure overhead when no edge.length/node.label) and downgrades the forest to a plain list (forces per-tree RenumberTips). | Rprof at (n=30,k=5000): RenumberTips.phylo self 22% / total 54%; wrapper = 57% of end-to-end (baselines.md). **Two fixes, different scope:** (a) SAFE/localised — a short-circuit consistency check in Consensus() that SKIPS RenumberTips when all trees already share tree1's tip.label order (a genuine no-op then); ~0s overhead on mismatch, saves ~0.135s/call (~2× end-to-end) when it triggers, BUT only helps already-consistent input (ape::rtree shuffles label ORDER, so the bench's `rand`/`tall` don't trigger it; real bootstrap/posterior sets often do). (b) GENERAL/risky — route genuine relabelling through the C++ batch path `_TreeTools_renumber_tips_batch` (needs a labelled multiPhylo; helps inconsistent input too). Note: passing a precomputed char order is *slower* (TipLabels.character does more). Shared-code risk for (b) — gate with full test suite. | +| C-003 | 3 | OPEN | — | [Optimise] hashed: large `unordered_map` churn dominates the LOW-height high-distinct (`rand`) case (deferred materialisation does NOT help there — clusters are tiny). Try a flat open-addressing map / better reserve. Asymptotic floor is Ω(#distinct) ops; only Jansson's O(n) candidate tree removes it. | jansson-bound.R: rand high-n strict ≪ hashed; the gap is hashmap ops, not materialisation. Realistic (concordant) inputs have few distinct splits ⇒ small map ⇒ low priority. | + +## Jansson decision (tracking) + +Open question: does deterministic Jansson (Maj_Rule_Plus, O(kn)) beat OPTIMISED +hashed in any *realistic* regime? Realistic consensus inputs are concordant (few +distinct splits) where hashed is O(kn) tiny-constant and wins. Jansson's only +edge is high-n LOW-concordance (≈ independent random trees — not a real consensus +input). Lower bound: Jansson ≥ 2×strict. Decision pending grid-v2 + a realistic +high-concordance scenario. See log.md / PLAN-consensus.md. diff --git a/dev/profiling/focus-areas.md b/dev/profiling/focus-areas.md new file mode 100644 index 000000000..efdb6c61c --- /dev/null +++ b/dev/profiling/focus-areas.md @@ -0,0 +1,16 @@ +# Profiling focus areas — Consensus + +Ranked by (per-call cost) × (call frequency on the consensus hot path). +See PLAN-consensus.md for the broader project. + +| # | Area | Files | Why hot | Baseline cost | Last profiled | Status | +|---|------|-------|---------|---------------|---------------|--------| +| 1 | `count_splits_hashed` (majority default) | src/consensus.cpp:267 | One postorder pass/tree; the default majority counter. O(kn). | tbd | — | NEW | +| 2 | `count_splits_exact` (deterministic) | src/consensus.cpp:86 | Builds packed bitkey by L..R loop/cluster ⇒ O(k·n·h); slow for tall trees. | tbd | — | NEW | +| 3 | `ClusterTable` constructor | inst/include/TreeTools/ClusterTable.h:352 | Built once per tree per call ⇒ k constructions; `root_on_node` + allocs. | tbd | — | NEW | +| 4 | `Consensus()` R wrapper | R/Consensus.R:46 | Per-call R overhead: metadata strip (lapply), NTip, RenumberTips, Preorder, splits_to_edge, RootTree. May dominate at low-n/high-k. | tbd | — | NEW | +| 5 | strict single-ref path | src/consensus.cpp:165 | p=1 path; already O(kn) optimal. Profile only to confirm AT-LIMIT. | tbd | — | NEW | +| 6 | `for_each_internal_node` primitive | src/consensus.cpp:44 | Shared postorder/stack arithmetic backing #1,#2,#5. | tbd | — | NEW | +| 7 | Jansson core (to be written) | src/consensus.cpp (new) | Deterministic O(kn) candidate-tree path; compare vs #1,#2. | tbd | — | NEW | + +Parked (called once/session; not in rotation): IO, label checking. diff --git a/dev/profiling/log.md b/dev/profiling/log.md new file mode 100644 index 000000000..6e46def4d --- /dev/null +++ b/dev/profiling/log.md @@ -0,0 +1,88 @@ +# Profiling log — Consensus + +last_focus: 1 + +--- + +## Round 1 — deferred materialisation + Jansson decision (2026-06-02) + +**Optimisation C-001 (DONE, gated, measured).** Hashed counter now defers +split-pattern materialisation: each distinct split keeps a `(tree,L,R)` witness; +only splits reaching `thresh` are materialised (consensus), or all (split_freq). +Gate: 576 checks PASS. Measured (compare-grids.R): hashed median x1.23, with the +wins where it was weakest — **tall(10000,5) x13.0**, tall(5000,10) x8.6, +rand(10000,5) x2.5, tall/rand high-k/high-n x1.5–2.9. nsplits identical in every +cell (no behaviour change). exact untouched (x1.02). + +**Jansson decision (bound re-run vs OPTIMISED hashed, + concord70).** +- Concordant (realistic) high-k/high-n: 2*strict ≥ hashed (x1.24–1.32) ⇒ + **Jansson can't win**. Same for all high-k/low-n. +- **Moderate conflict** (realistic gene-tree / bootstrap-with-rogues: base + ~5% + rogue taxa relocated/replicate) at high-k/high-n: 2*strict ≥ hashed + (2000,500: 1.54; 1000,1000: 1.42) ⇒ **Jansson can't win**. low-k/high-n + (5000,10): 1.09 ⇒ loses. +- Only NOT-proven cells: (10000,5) (any concordance) — but <60ms absolute, a 2× + gap is irrelevant; and high-k/high-n EXTREME conflict (rand/tall) — degenerate + near-star consensus, not a real input. +- **DECISION (advisor-confirmed): do NOT implement full Jansson.** Proven to lose + (2*strict ≥ hashed) in every realistic, time-meaningful regime. Rationale: + user's "avoid redundant code"; high correctness risk (FACT ships broken + majority); collision risk waived. Honest scope: this is a rigorous *lower-bound* + result where it concludes — not a direct Jansson measurement; the un-proven + cells are negligible-time or degenerate. Re-openable (algorithm + subroutine + specs in PLAN-consensus.md / DECISION.md). See **DECISION.md** (user-facing). + +**Advisor passes (2):** (1) flagged compare-grids' nsplits check is COUNT not +CONTENT ⇒ added high-n (n=1000/3000) hashed==exact split-SET content cells; +confirmed verify-consensus.R green; added moderate-conflict generator. (2) flagged +the symmetric gap — the witness change also rewired SplitFrequency's hashed +assembly, only checked at n≤30 ⇒ added high-n (n=2000/5000) SplitFrequency +hashed==exact split-SET+COUNT cells. Gate now 590 checks PASS. Every modified +code path gated at shipping scale (+ test-consensus 8/8, test-Support 6/6). +Advisor also corrected its earlier "commit" → do NOT commit (on main, not asked). + +**Next (if continuing):** C-002 wrapper fast-path (high-k/low-n, 57% overhead) — +the remaining real throughput win, in a target regime. Shared-code (RenumberTips) +risk ⇒ gate with full test suite. + +--- + +## Round 0 — scaffold + baseline + Jansson lower bound (2026-06-02) + +**Setup.** Installed -O2 -g build into scratch lib (`.libpath.txt`); all drivers +load from it (not load_all). Built the correctness gate +(`correctness-gate.R`, 576 checks: hashed==exact everywhere; ape agrees at p=0.5) +and the benchmark grid (`drivers/consensus-grid.R` → `results-grid.csv`, +summarised in `baselines.md`). + +**Correctness fixes made en route (real bugs):** +- gate `which.min()` → `integer(0)` silently skipped EVERY ape check; + fixed to tip-index 1. Hard hashed==exact gate was still valid (both used the + same broken-but-consistent canonicaliser). +- `dev/red-team/verify-consensus.R` called `consensus_tree(forest, p, hash=FALSE)` + but the export arg is `exact` → "unused argument" → that adversarial gate had + been DEAD since the rename. Fixed to `exact = TRUE` (lines 49, 77). +- Threshold-convention finding (surfaced to MRS, not changed): TreeTools keeps + `count > k*p`; ape keeps `>= k*p` for p>0.5; roxygen says "p or more" (>=) but + code is strict (>). Out of scope here. + +**Baseline result.** hashed beats exact in all 80 cells (1.1x–3x). R-wrapper is +57% of end-to-end at high-k/low-n (30,5000). See `baselines.md`. + +**Jansson lower bound** (`drivers/jansson-bound.R`; advisor-validated: Jansson ≥ +2×strict-path). Verdict per cell: +- low-n/high-k: 2*strict ≥ hashed (ratio 1.1–1.4) ⇒ **Jansson can't win** there. +- HIGH-n (all shapes): 2*strict ≪ hashed (ratio 0.07–0.51) ⇒ **INCONCLUSIVE**. + Strict is 5–30× faster than hashed-majority at high-n because hashed handles + ~k·n DISTINCT splits (big hash map + per-split materialisation) while strict + matches against ONE reference's O(n) clusters. Jansson's candidate tree is also + O(n) clusters ⇒ it plausibly shares strict's advantage and could BEAT hashed at + high-n. **Must implement Jansson to settle this** (user asked; advisor concurs). + +**Caveat noticed:** cost structure is subtler than "materialisation dominates" +(identical is *slower* than rand at (10000,5) despite fewer distinct splits) ⇒ +must VTune the core, not guess, before optimising it. + +**Next:** (R1) profvis the R wrapper at high-k/low-n → fast-path. (R2) VTune the +hashed/exact core at high-n to find the true hotspot. (R3) implement Jansson +(R-prototype-first, gated) and benchmark vs optimised hashed. diff --git a/src/consensus.cpp b/src/consensus.cpp index a8bc892e7..309701bfd 100644 --- a/src/consensus.cpp +++ b/src/consensus.cpp @@ -19,6 +19,26 @@ using TreeTools::ClusterTable; struct StackEntry { int32 L, R, N, W; }; +// A distinct split's "witness": tree index t and the [L, R] encode-range that +// the split spans in THAT tree's own ClusterTable encoding (where every cluster +// is contiguous). The split's leaf set is reconstructed on demand as +// {tables[t].DECODE(j) : L <= j <= R}. Lets the hashed counter defer building +// the packed bit pattern until thresholding, so non-surviving splits (the bulk, +// on low-concordance input) are never materialised. +struct SplitWitness { int32 t, L, R; }; + +// Reconstruct a split's packed bit pattern from its witness into row `dst` +// (nbin bytes, pre-zeroed by caller). tables is non-const (DECODE mutates none +// of the observable state but is not const-qualified). +inline void materialise_witness(std::vector& tables, + const SplitWitness& w, Rbyte* const dst) { + ClusterTable& tab = tables[w.t]; + for (int32 j = w.L; j <= w.R; ++j) { + const int32 leaf_idx = tab.DECODE(j) - 1; // 0-based original leaf id + dst[leaf_idx >> 3] |= static_cast(1 << (leaf_idx & 7)); + } +} + // Throttled (~1 s) user-interrupt check. inline void throttled_interrupt( std::chrono::steady_clock::time_point& last) { @@ -123,8 +143,7 @@ void count_splits_exact(std::vector& tables, const int32 n_tip, // for the majority/threshold consensus. Non-dependent name in the template // below, so it must be declared first. void count_splits_hashed(std::vector& tables, const int32 n_tip, - const int32 nbin, - std::vector>& split_patterns, + std::vector& witnesses, std::vector& counts); // --------------------------------------------------------------------------- @@ -209,16 +228,11 @@ RawMatrix calc_consensus_tree( if (splits_found == ntip_3) return ret; } } - } else { - // ---- Majority / threshold: count then threshold ----------------------- + } else if (exact) { + // ---- Majority / threshold, deterministic: count then threshold -------- std::vector> split_patterns; std::vector counts; - if (exact) { - count_splits_exact(tables, n_tip, nbin, S, split_patterns, counts); - } else { - count_splits_hashed(tables, n_tip, nbin, split_patterns, counts); - } - + count_splits_exact(tables, n_tip, nbin, S, split_patterns, counts); const int32 n_distinct = int32(split_patterns.size()); for (int32 i = 0; i < n_distinct; ++i) { if (counts[i] >= thresh) { @@ -229,6 +243,25 @@ RawMatrix calc_consensus_tree( if (splits_found == ntip_3) return ret; } } + } else { + // ---- Majority / threshold, hashed: count, then materialise ONLY the + // splits that reach the threshold (the rest are never built). -------- + std::vector witnesses; + std::vector counts; + count_splits_hashed(tables, n_tip, witnesses, counts); + const int32 n_distinct = int32(witnesses.size()); + std::vector row(nbin); + for (int32 i = 0; i < n_distinct; ++i) { + if (counts[i] >= thresh) { + std::fill(row.begin(), row.end(), Rbyte(0)); + materialise_witness(tables, witnesses[i], row.data()); + for (int32 c = 0; c < nbin; ++c) { + ret(splits_found, c) = row[c]; + } + ++splits_found; + if (splits_found == ntip_3) return ret; + } + } } return (splits_found == 0) ? RawMatrix(0, nbin) : @@ -241,8 +274,10 @@ RawMatrix calc_consensus_tree( // A single O(kn) pass: each non-trivial cluster is identified by a 128-bit // subtree hash = the (order-independent) sum of its leaves' fixed splitmix64 // hashes, so the same split in different trees hashes identically without an -// O(cluster size) key. Counts accumulate directly; the bit pattern is -// materialised only the first time a split is seen. Exactness is therefore +// O(cluster size) key. Counts accumulate directly; materialisation of the +// packed bit pattern is DEFERRED — each distinct split keeps only a witness +// (tree + encode-range), so the bulk of splits that never reach the consensus +// threshold are never materialised (the dominant cost at high n). Exactness is // probabilistic (a 128-bit collision, ~1e-30, would conflate two splits). inline uint64_t splitmix64(uint64_t x) { x += 0x9e3779b97f4a7c15ULL; @@ -265,8 +300,7 @@ struct Hash128Hasher { }; void count_splits_hashed(std::vector& tables, const int32 n_tip, - const int32 nbin, - std::vector>& split_patterns, + std::vector& witnesses, std::vector& counts) { const int32 n_trees = int32(tables.size()); const int32 ntip_3 = n_tip - 3; @@ -324,13 +358,10 @@ void count_splits_hashed(std::vector& tables, const int32 n_tip, const Hash128 hkey{hlo, hhi}; auto it = split_map.find(hkey); if (it == split_map.end()) { - split_map.emplace(hkey, int32(split_patterns.size())); - std::vector pattern(nbin, 0); - for (int32 j = L; j <= R; ++j) { - const int32 leaf_idx = tree.DECODE(j) - 1; - pattern[leaf_idx >> 3] |= (Rbyte(1) << (leaf_idx & 7)); - } - split_patterns.push_back(std::move(pattern)); + split_map.emplace(hkey, int32(witnesses.size())); + // Defer materialisation: record where this split lives (tree t, + // encode-range [L, R]) and rebuild the pattern later only if needed. + witnesses.push_back({t, L, R}); counts.push_back(1); } else { ++counts[it->second]; @@ -356,6 +387,27 @@ inline List frequencies_list( return List::create(Named("splits") = ret, Named("counts") = count_vec); } +// As frequencies_list, but materialising each split's pattern from its witness +// (split_frequencies needs every distinct split, so all are built — same total +// work as eager, just deferred, with less peak memory: no vector-of-patterns). +inline List frequencies_list_from_witnesses( + std::vector& tables, + const std::vector& witnesses, + const std::vector& counts, const int32 nbin) { + const int32 splits_found = int32(witnesses.size()); + RawMatrix ret(splits_found, nbin); + std::vector row(nbin); + for (int32 r = 0; r < splits_found; ++r) { + std::fill(row.begin(), row.end(), Rbyte(0)); + materialise_witness(tables, witnesses[r], row.data()); + for (int32 c = 0; c < nbin; ++c) { + ret(r, c) = row[c]; + } + } + IntegerVector count_vec(counts.begin(), counts.end()); + return List::create(Named("splits") = ret, Named("counts") = count_vec); +} + List calc_split_frequencies_hashed(const List& trees, const int32 n_tip) { const int32 n_trees = trees.length(); std::vector tables; @@ -364,10 +416,10 @@ List calc_split_frequencies_hashed(const List& trees, const int32 n_tip) { tables.emplace_back(ClusterTable(Rcpp::List(trees(i)))); } const int32 nbin = (n_tip + 7) / 8; - std::vector> split_patterns; + std::vector witnesses; std::vector counts; - count_splits_hashed(tables, n_tip, nbin, split_patterns, counts); - return frequencies_list(split_patterns, counts, nbin); + count_splits_hashed(tables, n_tip, witnesses, counts); + return frequencies_list_from_witnesses(tables, witnesses, counts, nbin); } // Exact split frequencies: the same single-pass count, returned in full. From e089388ae3be2aceece45cdd7e2be26bc521dbfd Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 09:05:31 +0100 Subject: [PATCH 04/11] Relabel unlabelled forests in C++ in RenumberTips() RenumberTips() on an unlabelled multiPhylo or list previously relabelled each tree with a separate R call (RenumberTips.phylo per element). It now relabels the whole forest in a single C++ pass (renumber_tips_to), deriving each tree's permutation from one hash of the target labels, with a no-op pass-through for trees already in target order. The C++ helper returns NULL to fall back to the existing per-tree R path for anything it would not handle identically: numeric tipOrder (per-tree target), a label set differing from the target (different length, unknown/NA label, or non-bijection), duplicate/NA target labels, or non-phylo list elements. List names are preserved. Results are unchanged. 1.8-9x faster (9x at high-k/low-n: 5000 trees of 30 tips, 0.335s -> 0.037s); Consensus() and every other RenumberTips() caller share the win. Full test suite green; new equivalence test confirms the C++ path matches the per-tree path across mixed orderings. Telling label-match without looking: only a labelled multiPhylo (TipLabel attribute) guarantees a shared order without inspecting each tree; absent that, inspection is unavoidable but now done in C++. Co-Authored-By: Claude Opus 4.8 --- NEWS.md | 4 ++ R/RcppExports.R | 4 ++ R/tree_numbering.R | 35 +++++++-- dev/profiling/drivers/renumber-bench.R | 41 +++++++++++ dev/profiling/findings.md | 3 +- src/RcppExports.cpp | 13 ++++ src/renumber_tips.cpp | 98 ++++++++++++++++++++++++++ tests/testthat/test-tree_numbering.R | 25 ++++++- 8 files changed, 217 insertions(+), 6 deletions(-) create mode 100644 dev/profiling/drivers/renumber-bench.R diff --git a/NEWS.md b/NEWS.md index b4ad6f37e..9db32f329 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,10 @@ until it is needed, so splits that never reach the consensus threshold are no longer built. Identical results; up to ~13× faster for large trees (greatest gains for tall trees / many tips), with no change at small sizes. +- `RenumberTips()` relabels an unlabelled `multiPhylo` or `list` of trees in a + single C++ pass instead of a per-tree R loop, with a no-op fast path for trees + already in the target order. Speeds up `Consensus()` and other callers when + combining many trees; results are unchanged. # TreeTools 2.4.0 (2026-06-02) # diff --git a/R/RcppExports.R b/R/RcppExports.R index 54c14a345..243e2273e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -101,6 +101,10 @@ renumber_tips_batch <- function(trees, perm, n_tip, new_labels) { .Call(`_TreeTools_renumber_tips_batch`, trees, perm, n_tip, new_labels) } +renumber_tips_to <- function(trees, target) { + .Call(`_TreeTools_renumber_tips_to`, trees, target) +} + cpp_edge_to_splits <- function(edge, order, nTip) { .Call(`_TreeTools_cpp_edge_to_splits`, edge, order, nTip) } diff --git a/R/tree_numbering.R b/R/tree_numbering.R index ff6cdc2c5..dd6aad23b 100644 --- a/R/tree_numbering.R +++ b/R/tree_numbering.R @@ -601,6 +601,25 @@ TntOrder.NULL <- function(tree) NULL #' @export RenumberTips <- function(tree, tipOrder) UseMethod("RenumberTips") +# Fast path shared by the unlabelled multiPhylo and list methods: relabel an +# entire forest to a single target order in one C++ call, instead of dispatching +# RenumberTips.phylo per tree in R. Returns the relabelled list, or NULL to +# signal that the caller should fall back to the per-tree R path (the C++ helper +# declines anything it would not handle identically: see renumber_tips_to()). +.RenumberForestToC <- function(tree, tipOrder) { + if (is.numeric(tipOrder)) { + # Numeric tipOrder indexes each tree's OWN labels, so the target differs per + # tree; only the R path handles that. + return(NULL) + } + newOrder <- TipLabels(tipOrder, single = TRUE) + if (anyDuplicated(newOrder)) { + # Let RenumberTips.phylo raise its "repeated in tipOrder" error. + return(NULL) + } + renumber_tips_to(tree, newOrder) +} + #' @rdname RenumberTips #' @export RenumberTips.phylo <- function(tree, tipOrder) { @@ -667,9 +686,16 @@ RenumberTips.multiPhylo <- function(tree, tipOrder) { labelled <- !is.null(at[["TipLabel"]]) if (!labelled) { - # Unlabelled: each tree may have different tip orderings, - # so must process individually - tree <- lapply(tree, RenumberTips.phylo, tipOrder) + # Unlabelled: each tree may have its own tip ordering. Relabel the whole + # forest in one C++ pass; fall back to the per-tree R path for inputs the + # fast path declines (numeric tipOrder, differing label sets, non-phylo + # elements, ...). + relabelled <- .RenumberForestToC(tree, tipOrder) + tree <- if (is.null(relabelled)) { + lapply(tree, RenumberTips.phylo, tipOrder) + } else { + relabelled + } attributes(tree) <- at return(tree) } @@ -726,7 +752,8 @@ RenumberTips.multiPhylo <- function(tree, tipOrder) { #' @rdname RenumberTips #' @export RenumberTips.list <- function(tree, tipOrder) { - lapply(tree, RenumberTips, tipOrder) + relabelled <- .RenumberForestToC(tree, tipOrder) + if (is.null(relabelled)) lapply(tree, RenumberTips, tipOrder) else relabelled } #' @rdname RenumberTips diff --git a/dev/profiling/drivers/renumber-bench.R b/dev/profiling/drivers/renumber-bench.R new file mode 100644 index 000000000..cef90770e --- /dev/null +++ b/dev/profiling/drivers/renumber-bench.R @@ -0,0 +1,41 @@ +# Benchmark the RenumberTips unlabelled fast path (C++ renumber_tips_to) against +# the per-tree R path it replaces, and the resulting Consensus() wrapper share. +# Run: Rscript dev/profiling/drivers/renumber-bench.R +suppressWarnings(suppressMessages({ + libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1] + library(TreeTools, lib.loc = libpath) +})) +set.seed(5813) +REPS <- 6L +tt <- function(expr) { e <- substitute(expr); env <- parent.frame() + eval(e, env); ts <- numeric(REPS) + for (i in seq_len(REPS)) { t0 <- Sys.time(); eval(e, env) + ts[i] <- as.numeric(Sys.time() - t0, units = "secs") } + min(ts) } + +# Per-tree R reference path (what RenumberTips.multiPhylo used to do for an +# unlabelled forest): apply RenumberTips to each tree individually. +r_path <- function(forest, target) lapply(forest, RenumberTips, target) + +cat(sprintf("%-6s %-6s %12s %12s %9s\n", "n", "k", "R per-tree", "C++ batch", "speedup")) +for (cfg in list(c(30,5000), c(50,2000), c(100,1000), c(500,300), c(2000,100))) { + n <- cfg[1]; k <- cfg[2] + forest <- lapply(seq_len(k), function(i) ape::rtree(n, br = NULL)) # shuffled orders + target <- paste0("t", seq_len(n)) + mp <- structure(forest, class = "multiPhylo") + tR <- tt(r_path(forest, target)) + tC <- tt(RenumberTips(mp, target)) + cat(sprintf("%-6d %-6d %12.4f %12.4f %8.2fx\n", n, k, tR, tC, tR / tC)) +} + +cat("\n== Consensus() end-to-end vs core (new wrapper share) ==\n") +for (cfg in list(c(30,5000), c(100,100), c(2000,10))) { + n <- cfg[1]; k <- cfg[2] + forest <- lapply(seq_len(k), function(i) ape::rtree(n, br = NULL)) + raw <- structure(forest, class = "multiPhylo") + pre <- Preorder(RenumberTips(raw, raw[[1]])) + te <- tt(Consensus(raw, p = 0.5)) + tc <- tt(TreeTools:::consensus_tree(pre, 0.5, exact = FALSE)) + cat(sprintf("n=%-5d k=%-5d Consensus()=%8.4f core=%8.4f wrapper=%8.4f (%.0f%%)\n", + n, k, te, tc, te - tc, 100 * (te - tc) / te)) +} diff --git a/dev/profiling/findings.md b/dev/profiling/findings.md index e2165aa70..ec7d11927 100644 --- a/dev/profiling/findings.md +++ b/dev/profiling/findings.md @@ -5,7 +5,8 @@ Task-row format. P = priority (1 highest). Status: OPEN / DONE / WONTFIX. | ID | P | Status | Verified Δ | Finding | Evidence | |----|---|--------|-----------|---------|----------| | C-001 | 1 | DONE | hashed median **x1.23**; tall(10000,5) **x13.0**, tall(5000,10) x8.6, rand(10000,5) x2.5, tall(2000,500) x2.9; nsplits identical in every cell. | [Optimise] hashed: DEFER split-pattern materialisation until thresholding — keep a `(tree,L,R)` witness per distinct split, build the packed pattern only for splits reaching `thresh`. | At high n the dominant hashed cost is per-distinct-split work, most wasted (few/no splits survive on low-concordance input). Strict path (no materialisation) is 5–30× faster than hashed-majority at high n. Implemented in src/consensus.cpp (SplitWitness + materialise_witness). Gate: 590 checks PASS (hashed==exact==ape@0.5; incl. consensus content n≤3000 and SplitFrequency split+count n=2000/5000); test-consensus 8/8, test-Support 6/6, verify-consensus green. Speedup verified: compare-grids.R. | -| C-002 | 2 | OPEN | — | [Port/Optimise] Consensus() R wrapper: `RenumberTips` is 54% of the wrapper at high-k/low-n; the metadata-strip `lapply` copies every tree (pure overhead when no edge.length/node.label) and downgrades the forest to a plain list (forces per-tree RenumberTips). | Rprof at (n=30,k=5000): RenumberTips.phylo self 22% / total 54%; wrapper = 57% of end-to-end (baselines.md). **Two fixes, different scope:** (a) SAFE/localised — a short-circuit consistency check in Consensus() that SKIPS RenumberTips when all trees already share tree1's tip.label order (a genuine no-op then); ~0s overhead on mismatch, saves ~0.135s/call (~2× end-to-end) when it triggers, BUT only helps already-consistent input (ape::rtree shuffles label ORDER, so the bench's `rand`/`tall` don't trigger it; real bootstrap/posterior sets often do). (b) GENERAL/risky — route genuine relabelling through the C++ batch path `_TreeTools_renumber_tips_batch` (needs a labelled multiPhylo; helps inconsistent input too). Note: passing a precomputed char order is *slower* (TipLabels.character does more). Shared-code risk for (b) — gate with full test suite. | +| C-002 | 2 | DONE | RenumberTips **1.8–9.0×** (30,5000: 0.335→0.037s 9.0×; 50,2000 6.7×; 100,1000 6.3×; 500,300 2.6×; 2000,100 1.8×). Results identical. | [Optimise] `RenumberTips()` on an unlabelled multiPhylo/list now relabels the whole forest in ONE C++ pass (`renumber_tips_to`), replacing the per-tree R loop; no-op pass-through for trees already in target order. Shared by Consensus() and all RenumberTips callers. Returns NULL → existing R fallback for numeric tipOrder / differing label sets / non-phylo elements, so edge-case behaviour is preserved exactly. | **"Tell without looking?"** Only a *labelled* multiPhylo (TipLabel attr) guarantees a shared order without inspecting each tree; otherwise inspection is unavoidable, but now in C++ (and a matching tree is passed through untouched). Gate: full suite GREEN (296); correctness-gate 590; new equivalence test (C++ == per-tree across mixed orderings). LESSON: the existing RenumberTips test caught that the C++ list initially dropped `names` (the lapply path preserves them) — fixed by copying input names onto the result. Bench: drivers/renumber-bench.R. | +| C-004 | 2 | OPEN | — | [Optimise] Consensus() metadata-strip `lapply(c(trees), \\(tr) {tr$edge.length<-NULL; tr$node.label<-NULL; tr})` copies EVERY tree unconditionally; with RenumberTips now fast it is the dominant remaining wrapper cost at high-k/low-n. The C++ core never reads edge.length/node.label. Options: strip only when present; or drop the strip and let RenumberTips' clone carry (bigger clone for trees WITH lengths, none for br=NULL). Consensus()-local, low risk. | renumber-bench.R: at (30,5000) wrapper ≈ 43–63% of Consensus() AFTER the RenumberTips speedup; RenumberTips is now only ~25% of the wrapper, the strip + Preorder + result assembly the rest. | | C-003 | 3 | OPEN | — | [Optimise] hashed: large `unordered_map` churn dominates the LOW-height high-distinct (`rand`) case (deferred materialisation does NOT help there — clusters are tiny). Try a flat open-addressing map / better reserve. Asymptotic floor is Ω(#distinct) ops; only Jansson's O(n) candidate tree removes it. | jansson-bound.R: rand high-n strict ≪ hashed; the gap is hashmap ops, not materialisation. Realistic (concordant) inputs have few distinct splits ⇒ small map ⇒ low priority. | ## Jansson decision (tracking) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 36c2de209..4a9110e0d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -326,6 +326,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// renumber_tips_to +SEXP renumber_tips_to(Rcpp::List trees, const Rcpp::CharacterVector target); +RcppExport SEXP _TreeTools_renumber_tips_to(SEXP treesSEXP, SEXP targetSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::List >::type trees(treesSEXP); + Rcpp::traits::input_parameter< const Rcpp::CharacterVector >::type target(targetSEXP); + rcpp_result_gen = Rcpp::wrap(renumber_tips_to(trees, target)); + return rcpp_result_gen; +END_RCPP +} // cpp_edge_to_splits Rcpp::RawMatrix cpp_edge_to_splits(const Rcpp::IntegerMatrix& edge, const Rcpp::IntegerVector& order, const Rcpp::IntegerVector& nTip); RcppExport SEXP _TreeTools_cpp_edge_to_splits(SEXP edgeSEXP, SEXP orderSEXP, SEXP nTipSEXP) { @@ -562,6 +574,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeTools_node_depth_unrooted", (DL_FUNC) &_TreeTools_node_depth_unrooted, 4}, {"_TreeTools_path_lengths", (DL_FUNC) &_TreeTools_path_lengths, 3}, {"_TreeTools_renumber_tips_batch", (DL_FUNC) &_TreeTools_renumber_tips_batch, 4}, + {"_TreeTools_renumber_tips_to", (DL_FUNC) &_TreeTools_renumber_tips_to, 2}, {"_TreeTools_cpp_edge_to_splits", (DL_FUNC) &_TreeTools_cpp_edge_to_splits, 3}, {"_TreeTools_duplicated_splits", (DL_FUNC) &_TreeTools_duplicated_splits, 2}, {"_TreeTools_mask_splits", (DL_FUNC) &_TreeTools_mask_splits, 1}, diff --git a/src/renumber_tips.cpp b/src/renumber_tips.cpp index 437fd7d70..3573ff7f2 100644 --- a/src/renumber_tips.cpp +++ b/src/renumber_tips.cpp @@ -1,6 +1,11 @@ #include using namespace Rcpp; +#include /* for fill */ +#include /* for string (label keys) */ +#include /* for unordered_map */ +#include /* for vector */ + // Apply a precomputed tip permutation to every tree in batch. // Returns a list of modified phylo objects (cloned, with new // edge matrices, updated tip.label, and "preorder" downgraded to "cladewise"). @@ -57,3 +62,96 @@ Rcpp::List renumber_tips_batch( return result; } + +// Renumber every tree in an UNLABELLED forest so its tips follow `target`'s +// order, entirely in C++. Unlike renumber_tips_batch (one shared permutation +// for a labelled multiPhylo), each tree may carry its own tip.label order, so a +// per-tree permutation is derived from a single hash of `target`. This is the +// fast path for RenumberTips on a plain list / unlabelled multiPhylo, replacing +// a per-tree R loop. +// +// Returns the relabelled list, OR R_NilValue to tell the R caller to fall back +// to the per-tree R path (RenumberTips.phylo). NULL is returned for anything +// this fast path does not handle identically to RenumberTips.phylo: an element +// that is not a phylo-like list or lacks `edge`/`tip.label`; a tree whose label +// set differs from `target` (different length, an unknown or NA label, or a +// non-bijection / duplicate); or duplicate/NA labels in `target` itself. The +// common case — every tree is a permutation of `target` — is handled here. +// +// A tree already in `target` order is returned unchanged (no clone), matching +// RenumberTips.phylo's no-op return. +// [[Rcpp::export]] +SEXP renumber_tips_to(Rcpp::List trees, const Rcpp::CharacterVector target) { + const int n_tip = target.size(); + const int n_trees = trees.size(); + if (n_tip == 0 || n_trees == 0) return R_NilValue; + + // target label -> 1-based new tip index + std::unordered_map pos; + pos.reserve(n_tip * 2); + for (int i = 0; i < n_tip; ++i) { + SEXP s = STRING_ELT(target, i); + if (s == NA_STRING) return R_NilValue; + pos.emplace(std::string(CHAR(s)), i + 1); + } + if (static_cast(pos.size()) != n_tip) return R_NilValue; // dup target labels + + Rcpp::List result(n_trees); + std::vector perm(n_tip + 1); + std::vector used(n_tip + 1); + + for (int t = 0; t < n_trees; ++t) { + SEXP orig = trees[t]; + if (!Rf_isVectorList(orig)) return R_NilValue; + Rcpp::List tree_i(orig); + if (!tree_i.containsElementNamed("tip.label") || + !tree_i.containsElementNamed("edge")) return R_NilValue; + + SEXP lab = tree_i["tip.label"]; + if (TYPEOF(lab) != STRSXP || Rf_length(lab) != n_tip) return R_NilValue; + + std::fill(used.begin(), used.end(), char(0)); + bool already = true; + for (int i = 0; i < n_tip; ++i) { + SEXP s = STRING_ELT(lab, i); + if (s == NA_STRING) return R_NilValue; + auto it = pos.find(std::string(CHAR(s))); + if (it == pos.end()) return R_NilValue; // label not in target + const int new_idx = it->second; + if (used[new_idx]) return R_NilValue; // non-bijection (dup label) + used[new_idx] = 1; + perm[i + 1] = new_idx; + if (new_idx != i + 1) already = false; + } + + if (already) { // no relabel needed + result[t] = orig; + continue; + } + + SEXP orig_class = Rf_getAttrib(orig, R_ClassSymbol); + Rcpp::List tree_c = Rcpp::clone(tree_i); + Rcpp::IntegerMatrix edge = + Rcpp::clone(Rcpp::as(tree_c["edge"])); + const int n_edge = edge.nrow(); + for (int j = 0; j < n_edge; ++j) { + int& child = edge(j, 1); + if (child <= n_tip) child = perm[child]; + } + tree_c["edge"] = edge; + tree_c["tip.label"] = target; + if (tree_c.hasAttribute("order")) { + Rcpp::CharacterVector ord = tree_c.attr("order"); + if (ord.size() && ord[0] == "preorder") { + tree_c.attr("order") = Rcpp::CharacterVector::create("cladewise"); + } + } + if (orig_class != R_NilValue) { + Rf_setAttrib(Rcpp::wrap(tree_c), R_ClassSymbol, orig_class); + } + result[t] = tree_c; + } + // Preserve list names (the R lapply path this replaces keeps them). + result.attr("names") = trees.attr("names"); + return result; +} diff --git a/tests/testthat/test-tree_numbering.R b/tests/testthat/test-tree_numbering.R index 22d75bfe8..9dfa51b03 100644 --- a/tests/testthat/test-tree_numbering.R +++ b/tests/testthat/test-tree_numbering.R @@ -177,7 +177,7 @@ test_that("RenumberTips.multiPhylo() covers edge cases", { result <- RenumberTips(mp8, 8:1) expect_equal(TipLabels(result[[1]]), paste0("t", 8:1)) - # No-op when order already matches (unlabelled: per-tree fallback) + # No-op when order already matches (unlabelled: C++ pass-through, unchanged) result_noop <- RenumberTips(mp8, TipLabels(mp8[[1]])) expect_equal(result_noop[[1]][["edge"]], mp8[[1]][["edge"]]) expect_equal(result_noop[[2]][["edge"]], mp8[[2]][["edge"]]) @@ -201,6 +201,29 @@ test_that("RenumberTips.multiPhylo() covers edge cases", { ) }) +test_that("RenumberTips() unlabelled C++ path matches the per-tree path", { + # The unlabelled multiPhylo / list fast path (renumber_tips_to in C++) must + # produce results IDENTICAL to applying RenumberTips.phylo per tree, across + # mixed tip orderings (incl. trees already in target order -> pass-through). + set.seed(91) + canon <- paste0("t", 1:12) + forest <- c( + list(BalancedTree(canon), PectinateTree(canon)), # already in order + lapply(1:6, function(i) ape::rtree(12, br = NULL)) # random orders + ) + for (target in list(canon, rev(canon))) { + ref <- lapply(forest, function(tr) RenumberTips(tr, target)) # per-tree R path + mp <- RenumberTips(structure(forest, class = "multiPhylo"), target) + ls <- RenumberTips(forest, target) # list method + for (i in seq_along(forest)) { + expect_identical(mp[[i]][["edge"]], ref[[i]][["edge"]]) + expect_identical(mp[[i]][["tip.label"]], ref[[i]][["tip.label"]]) + expect_identical(attr(mp[[i]], "order"), attr(ref[[i]], "order")) + expect_identical(ls[[i]][["edge"]], ref[[i]][["edge"]]) + } + } +}) + test_that("RenumberTips.multiPhylo() materializes tip.label", { # TipLabel-labelled multiPhylo: individual trees should gain tip.label # even when the order already matches (needed by RoguePlot and others From 4f18550684f5ca8e402f98fdd7eedb465227dc8a Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 09:44:59 +0100 Subject: [PATCH 05/11] Avoid copying every tree in Consensus() metadata strip Consensus() stripped edge.length/node.label from every input tree via a per-tree lapply that forced a copy of each tree. The consensus core reads only edge + tip.label and the output tree is rebuilt from scratch, so neither field can affect the result; the strip's only load-bearing job was coercing a labelled multiPhylo's shared tip labels onto each tree, which bare c() already does. Replace the strip with 'trees <- c(trees)' (~7x cheaper than the strip: 0.82->0.11 ms at k=201), cutting wrapper overhead ~25% on small forests of many short trees (forest201.80 3.38->2.54 ms), ~6% on forest1k.888 (strict C++ core dominates there). Output verified byte-identical: 108-cell before/after grid (plain / unaligned / edge.length / node.label / both / labelled / labelled+edge.length / list / differing-size x p x check.labels x hash) and a 36-cell core metadata-independence check. New test-Consensus.R block covers the previously untested metadata-bearing and labelled-multiPhylo paths. Full suite 4192 PASS; correctness gate 590 checks; verify-consensus green. Co-Authored-By: Claude Opus 4.8 --- NEWS.md | 4 ++ R/Consensus.R | 16 ++++--- dev/profiling/drivers/c004-equiv.R | 53 ++++++++++++++++++++ dev/profiling/drivers/gha-bench-diagnose.R | 56 ++++++++++++++++++++++ dev/profiling/findings.md | 25 +++++++++- tests/testthat/test-consensus.R | 29 +++++++++++ 6 files changed, 176 insertions(+), 7 deletions(-) create mode 100644 dev/profiling/drivers/c004-equiv.R create mode 100644 dev/profiling/drivers/gha-bench-diagnose.R diff --git a/NEWS.md b/NEWS.md index 9db32f329..162f9ef02 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,10 @@ single C++ pass instead of a per-tree R loop, with a no-op fast path for trees already in the target order. Speeds up `Consensus()` and other callers when combining many trees; results are unchanged. +- `Consensus()` no longer copies every input tree to strip branch lengths and + node labels (the consensus core ignores both); it now coerces in place, + trimming wrapper overhead (~25% faster on small forests of many short trees). + Results are unchanged. # TreeTools 2.4.0 (2026-06-02) # diff --git a/R/Consensus.R b/R/Consensus.R index 484a7dc23..300514e69 100644 --- a/R/Consensus.R +++ b/R/Consensus.R @@ -54,12 +54,16 @@ Consensus <- function(trees, p = 1, check.labels = TRUE, hash = TRUE) { stop("Expecting `trees` to be a list.") } - # Remove irrelevant metadata so we don't waste time processing it - trees <- lapply(c(trees), function(tr) { - tr[["edge.length"]] <- NULL - tr[["node.label"]] <- NULL - tr - }) + # Coerce to a plain list of standalone trees. c() materialises a labelled + # multiPhylo's shared tip labels onto each tree (the C++ core reads each + # tree's own `tip.label`), exactly as the previous strip-lapply did. We + # deliberately do NOT strip `edge.length` / `node.label`: the consensus core + # reads only `edge` + `tip.label`, and the output tree is rebuilt from + # scratch, so neither field can affect the result. The old per-tree + # `tr[["edge.length"]] <- NULL` forced a copy of every tree for no gain; + # bare c() is ~7x cheaper on metadata-free forests (the common case) and + # output-identical (verified: dev/profiling/drivers/c004-equiv.R, 108 cells). + trees <- c(trees) repeat { nTip <- NTip(trees) diff --git a/dev/profiling/drivers/c004-equiv.R b/dev/profiling/drivers/c004-equiv.R new file mode 100644 index 000000000..28076ac7f --- /dev/null +++ b/dev/profiling/drivers/c004-equiv.R @@ -0,0 +1,53 @@ +# C-004 regression: full Consensus() output must be byte-identical before/after +# replacing the metadata-strip lapply with bare c(trees). +# Run once on the OLD code (saves ref), once on the NEW code (compares). +suppressMessages(devtools::load_all(".", quiet = TRUE)) +set.seed(42) +ref_path <- "dev/profiling/c004-ref.rds" + +base <- lapply(1:9, function(i) ape::rtree(12, br = NULL)) +mp <- structure(RenumberTips(structure(base, class = "multiPhylo"), base[[1]]), + class = "multiPhylo") # aligned order +mpU <- structure(base, class = "multiPhylo") # UNaligned order +addEL <- function(f) structure(lapply(f, function(t){t$edge.length<-seq_len(nrow(t$edge)); t}), class="multiPhylo") +addNL <- function(f) structure(lapply(f, function(t){t$node.label<-paste0("n",seq_len(t$Nnode)); t}), class="multiPhylo") +diff_sz <- structure(c(base[1:5], lapply(base[6:9], function(t) DropTip(t, 1:2))), class="multiPhylo") + +variants <- list( + plain = mp, + unaligned = mpU, # exercises check.labels=TRUE relabel + edge_len = addEL(mp), + node_lab = addNL(mp), + both = addNL(addEL(mp)), + labelled = ape::.compressTipLabel(mp), + lab_el = ape::.compressTipLabel(addEL(mp)), + list_in = unclass(mp), # plain list, not multiPhylo + diff_size = diff_sz # exercises the KeepTip repeat loop +) +grid <- expand.grid(v = names(variants), p = c(0.5, 2/3, 1), + check = c(TRUE, FALSE), hash = c(TRUE, FALSE), + stringsAsFactors = FALSE) +canon <- function(tr) { + if (!inherits(tr, "phylo")) return(tr) + list(edge = Preorder(RootTree(tr, 1))$edge, tip = tr$tip.label, Nnode = tr$Nnode) +} +out <- Map(function(v, p, check, hash) + tryCatch(canon(suppressWarnings(Consensus(variants[[v]], p, check, hash))), + error = function(e) paste("ERR:", conditionMessage(e))), + grid$v, grid$p, grid$check, grid$hash) + +if (!file.exists(ref_path)) { + saveRDS(out, ref_path) + cat(sprintf("Saved reference: %d cells -> %s\n", length(out), ref_path)) +} else { + ref <- readRDS(ref_path) + same <- mapply(identical, ref, out) + if (all(same)) { + cat(sprintf("PASS: all %d Consensus() outputs identical to reference.\n", length(out))) + } else { + cat(sprintf("FAIL: %d/%d cells differ:\n", sum(!same), length(out))) + bad <- which(!same) + for (i in bad) cat(sprintf(" v=%-9s p=%.3g check=%s hash=%s\n", + grid$v[i], grid$p[i], grid$check[i], grid$hash[i])) + } +} diff --git a/dev/profiling/drivers/gha-bench-diagnose.R b/dev/profiling/drivers/gha-bench-diagnose.R new file mode 100644 index 000000000..c57dfbb9e --- /dev/null +++ b/dev/profiling/drivers/gha-bench-diagnose.R @@ -0,0 +1,56 @@ +# Why do the GHA bench-Consensus.R cells show NSD after C-001/C-002? +# Diagnose: which path each benchmark exercises + wrapper cost breakdown. +suppressMessages(devtools::load_all(".", quiet = TRUE)) + +forest201.80 <- as.phylo(0:200, 80) +forest21.260 <- as.phylo(0:20, 260) +forest1k.888 <- as.phylo(0:1000, 888) +forestMaj <- c(as.phylo(rep(c(0, 1e9), c(15, 50)), 100), + structure(lapply(rep(100, 50), BalancedTree), class = "multiPhylo")) + +describe <- function(nm, f) { + cls <- class(f) + tl <- attr(f, "TipLabel") + has_el <- any(vapply(f, function(t) !is.null(t[["edge.length"]]), logical(1))) + has_nl <- any(vapply(f, function(t) !is.null(t[["node.label"]]), logical(1))) + # do all trees share tip.label in identical order? + if (is.null(tl)) { + labs <- lapply(f, function(t) t[["tip.label"]]) + same_order <- all(vapply(labs[-1], identical, logical(1), labs[[1]])) + } else same_order <- NA # labelled: order shared by construction + cat(sprintf("%-14s k=%-5d n=%-4d class=%-10s TipLabel=%-5s edge.len=%-5s node.lab=%-5s sameTipOrder=%s\n", + nm, length(f), NTip(f[[1]]), paste(cls, collapse=","), + !is.null(tl), has_el, has_nl, same_order)) +} +cat("== forest structure ==\n") +describe("forest201.80", forest201.80) +describe("forest21.260", forest21.260) +describe("forest1k.888", forest1k.888) +describe("forestMaj", forestMaj) + +# ---- Wrapper cost breakdown (replicate Consensus() internals) ----------------- +bk <- function(label, expr, times = 25) { + expr <- substitute(expr) + e <- parent.frame() + t <- replicate(times, { g <- gc(FALSE); s <- Sys.time(); eval(expr, e); as.numeric(Sys.time() - s, units = "secs") }) + cat(sprintf(" %-26s %8.2f ms (min %6.2f)\n", label, 1000*median(t), 1000*min(t))) + invisible(median(t)) +} + +breakdown <- function(nm, trees, p = 1, check.labels = TRUE) { + cat(sprintf("\n== %s (p=%g, check.labels=%s) ==\n", nm, p, check.labels)) + hash <- TRUE + bk("full Consensus()", Consensus(trees, p, check.labels, hash)) + bk("metadata-strip lapply", lapply(c(trees), function(tr) { tr[["edge.length"]] <- NULL; tr[["node.label"]] <- NULL; tr })) + stripped <- lapply(c(trees), function(tr) { tr[["edge.length"]] <- NULL; tr[["node.label"]] <- NULL; tr }) + bk("NTip(trees)", NTip(stripped)) + if (check.labels) bk("RenumberTips", RenumberTips(stripped, stripped[[1]])) + rt <- if (check.labels) RenumberTips(stripped, stripped[[1]]) else stripped + bk("Preorder(trees[[1]])", Preorder(rt[[1]])) + bk("consensus_tree core", TreeTools:::consensus_tree(rt, p, exact = !hash)) +} + +breakdown("forest1k.888", forest1k.888, p = 1, check.labels = FALSE) +breakdown("forest201.80", forest201.80, p = 1, check.labels = FALSE) +breakdown("forest21.260", forest21.260, p = 1, check.labels = TRUE) +breakdown("forestMaj", forestMaj, p = 0.5, check.labels = FALSE) diff --git a/dev/profiling/findings.md b/dev/profiling/findings.md index ec7d11927..8b2570988 100644 --- a/dev/profiling/findings.md +++ b/dev/profiling/findings.md @@ -6,8 +6,31 @@ Task-row format. P = priority (1 highest). Status: OPEN / DONE / WONTFIX. |----|---|--------|-----------|---------|----------| | C-001 | 1 | DONE | hashed median **x1.23**; tall(10000,5) **x13.0**, tall(5000,10) x8.6, rand(10000,5) x2.5, tall(2000,500) x2.9; nsplits identical in every cell. | [Optimise] hashed: DEFER split-pattern materialisation until thresholding — keep a `(tree,L,R)` witness per distinct split, build the packed pattern only for splits reaching `thresh`. | At high n the dominant hashed cost is per-distinct-split work, most wasted (few/no splits survive on low-concordance input). Strict path (no materialisation) is 5–30× faster than hashed-majority at high n. Implemented in src/consensus.cpp (SplitWitness + materialise_witness). Gate: 590 checks PASS (hashed==exact==ape@0.5; incl. consensus content n≤3000 and SplitFrequency split+count n=2000/5000); test-consensus 8/8, test-Support 6/6, verify-consensus green. Speedup verified: compare-grids.R. | | C-002 | 2 | DONE | RenumberTips **1.8–9.0×** (30,5000: 0.335→0.037s 9.0×; 50,2000 6.7×; 100,1000 6.3×; 500,300 2.6×; 2000,100 1.8×). Results identical. | [Optimise] `RenumberTips()` on an unlabelled multiPhylo/list now relabels the whole forest in ONE C++ pass (`renumber_tips_to`), replacing the per-tree R loop; no-op pass-through for trees already in target order. Shared by Consensus() and all RenumberTips callers. Returns NULL → existing R fallback for numeric tipOrder / differing label sets / non-phylo elements, so edge-case behaviour is preserved exactly. | **"Tell without looking?"** Only a *labelled* multiPhylo (TipLabel attr) guarantees a shared order without inspecting each tree; otherwise inspection is unavoidable, but now in C++ (and a matching tree is passed through untouched). Gate: full suite GREEN (296); correctness-gate 590; new equivalence test (C++ == per-tree across mixed orderings). LESSON: the existing RenumberTips test caught that the C++ list initially dropped `names` (the lapply path preserves them) — fixed by copying input names onto the result. Bench: drivers/renumber-bench.R. | -| C-004 | 2 | OPEN | — | [Optimise] Consensus() metadata-strip `lapply(c(trees), \\(tr) {tr$edge.length<-NULL; tr$node.label<-NULL; tr})` copies EVERY tree unconditionally; with RenumberTips now fast it is the dominant remaining wrapper cost at high-k/low-n. The C++ core never reads edge.length/node.label. Options: strip only when present; or drop the strip and let RenumberTips' clone carry (bigger clone for trees WITH lengths, none for br=NULL). Consensus()-local, low risk. | renumber-bench.R: at (30,5000) wrapper ≈ 43–63% of Consensus() AFTER the RenumberTips speedup; RenumberTips is now only ~25% of the wrapper, the strip + Preorder + result assembly the rest. | +| C-004 | 2 | DONE | strip→`c(trees)`: forest201.80 3.38→2.54ms (**-25%**), forestMaj 2.60→2.10 (-19%), forest21.260 1.23→1.12 (-9%), forest1k.888 72.1→68.0 (-6%, strict core is 92%). Output byte-identical. | [Optimise] Consensus() metadata-strip replaced by bare `trees <- c(trees)`. The old `lapply(c(trees), \\(tr){tr$edge.length<-NULL; tr$node.label<-NULL; tr})` copied EVERY tree; the consensus core reads only edge+tip.label and the output is rebuilt from scratch, so neither field can change the answer. `c()` alone still materialises a labelled multiPhylo's shared labels onto each tree (the load-bearing job). **Floor finding (key):** the naive presence-guard (`if(!is.null(...))`) was a near-wash — the `is.null` reads cost ~as much as the assignments they skip (0.82→0.75ms), and REGRESSED the metadata-present case; the win comes from skipping the lapply entirely (strip 0.82→c() 0.11ms, ~7x). | gha-bench-diagnose.R: strip was 19–29% of the SMALL forests, 6% of forest1k.888. Equivalence: c004-equiv.R 108 cells (plain/unaligned/edge_len/node_lab/both/labelled/lab_el/list/diff_size × p × check × hash) byte-identical pre/post; core RawMatrix metadata-independent 36 cells; new test-Consensus.R block (edge.length/node.label/labelled, check T/F, p∈{.5,⅔,1}); full suite 4192 PASS; gate 590. | | C-003 | 3 | OPEN | — | [Optimise] hashed: large `unordered_map` churn dominates the LOW-height high-distinct (`rand`) case (deferred materialisation does NOT help there — clusters are tiny). Try a flat open-addressing map / better reserve. Asymptotic floor is Ω(#distinct) ops; only Jansson's O(n) candidate tree removes it. | jansson-bound.R: rand high-n strict ≪ hashed; the gap is hashmap ops, not materialisation. Realistic (concordant) inputs have few distinct splits ⇒ small map ⇒ low priority. | +| C-005 | 3 | OPEN | — | [Optimise] `NTip(trees)` in the Consensus() repeat-loop is now the next-largest wrapper cost after the C-004 strip removal: forest201.80 0.73ms (22% of total), forest1k.888 3.4ms (5%). It builds a full k-vector of tip counts every loop iteration just to test `length(unique(.)) > 1`. Could early-exit / cache / vectorise via the materialised list. Consensus()-local, low risk. | gha-bench-diagnose.R breakdown. | + +## Why the GHA `bench-Consensus.R` cells show NSD after C-001/C-002 (2026-06-03) + +The shipped wins don't appear in the GHA suite because **none of its 5 cells +exercise the regimes the optimisations target** (gha-bench-diagnose.R): + +- All four forests (`as.phylo(int, n)`) are **metadata-free, already in matching + tip order, and high-concordance** (consecutive tree-integers ⇒ few distinct + splits). +- **C-002 (RenumberTips):** 4/5 cells pass `check = FALSE` (partial-matches + `check.labels = FALSE`) ⇒ RenumberTips never runs. The one cell that runs it + (`forest21.260`, the −6.96% blip) is k=21 and already-ordered, so the C++ + path's fixed setup doesn't amortise vs the old R loop — real, but NSD. +- **C-001 (deferred materialisation):** 3/5 cells use default `p = 1` (strict) + ⇒ the single-reference path, which never calls `count_splits_hashed`. The 2 + majority cells (`p = 0.5`) are 1–3 ms and high-concordance ⇒ nothing to defer. +- The strict C++ core dominates the one big cell (`forest1k.888`: 66 of 72 ms). + +To make GHA *demonstrate* the wins, add cells that hit the regimes (OFFERED to +user, not yet added): a high-k unlabelled forest with **shuffled** tip order +(real relabel → C-002), and a tall **low-concordance `p < 1`** forest (→ C-001). +C-004 does move the small cells (strip was 19–29% of them). ## Jansson decision (tracking) diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R index a939d90d3..39fb1819c 100644 --- a/tests/testthat/test-consensus.R +++ b/tests/testthat/test-consensus.R @@ -56,6 +56,35 @@ test_that("Consensus()", { expect_equal(Consensus(trees), Preorder(StarTree(letters[1:4]))) }) +test_that("Consensus() ignores edge lengths, node labels, and TipLabel", { + # Consensus() no longer strips edge.length / node.label up front (it coerces + # with c() instead); the core reads only edge + tip.label and the result is + # rebuilt from scratch, so metadata must not change the answer, and a + # *labelled* multiPhylo (shared TipLabel attr, per-tree labels dropped) must + # be handled via c()'s materialisation even when check.labels = FALSE. + base <- list(BalancedTree(8), PectinateTree(8))[c(1, 1, 1, 1, 2, 2, 2)] + base <- structure(RenumberTips(base, base[[1]]), class = "multiPhylo") + withEL <- structure(lapply(base, function(t) { + t[["edge.length"]] <- seq_len(nrow(t[["edge"]])); t + }), class = "multiPhylo") + withNL <- structure(lapply(base, function(t) { + t[["node.label"]] <- paste0("n", seq_len(t[["Nnode"]])); t + }), class = "multiPhylo") + labelled <- ape::.compressTipLabel(base) + expect_true(is.null(.subset2(labelled, 1)[["tip.label"]])) # genuinely labelled + + for (p in c(0.5, 2 / 3, 1)) for (check in c(TRUE, FALSE)) { + ref <- Consensus(base, p, check.labels = check) + expect_equal(Consensus(withEL, p, check.labels = check), ref) + expect_equal(Consensus(withNL, p, check.labels = check), ref) + expect_equal(Consensus(labelled, p, check.labels = check), ref) + } + # Result carries no inherited metadata. + out <- Consensus(withEL, 0.5) + expect_null(out[["edge.length"]]) + expect_null(out[["node.label"]]) +}) + test_that("Consensus() handles large sets of trees", { oneTree <- as.phylo(0, 13) expect_equal( From 35922468dd3b795a85ce2fdceea3b63b563a3b03 Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 09:49:33 +0100 Subject: [PATCH 06/11] Test Consensus() metadata-invariance through the differing-size path The KeepTip repeat-loop (triggered by trees of unequal size) was the one combination not covered by the C-004 metadata-invariance test. Add a case confirming edge.length does not change the (warned) result there. Co-Authored-By: Claude Opus 4.8 --- tests/testthat/test-consensus.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R index 39fb1819c..bac1d8118 100644 --- a/tests/testthat/test-consensus.R +++ b/tests/testthat/test-consensus.R @@ -83,6 +83,15 @@ test_that("Consensus() ignores edge lengths, node labels, and TipLabel", { out <- Consensus(withEL, 0.5) expect_null(out[["edge.length"]]) expect_null(out[["node.label"]]) + + # Differing tree sizes exercise the KeepTip repeat-loop; metadata must not + # change its (warned) result either. + mixed <- list(BalancedTree(8), PectinateTree(8), DropTip(BalancedTree(8), 1:2)) + mixedEL <- lapply(mixed, function(t) { + t[["edge.length"]] <- seq_len(nrow(t[["edge"]])); t + }) + expect_equal(suppressWarnings(Consensus(mixedEL, 0.5)), + suppressWarnings(Consensus(mixed, 0.5))) }) test_that("Consensus() handles large sets of trees", { From 50376bee2cc4152e626fca8be8a1fd3e11c45b7d Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 12:52:52 +0100 Subject: [PATCH 07/11] Test p=0.5 tie resolution against ape; document p>0.5 threshold question The even-n tie fixture (2xBalancedTree + 2xPectinateTree, n=4) was only exercised by the "exact == hashed" test, which checks the two internal counters against each other -- it cannot catch an error shared by both. Add ApeTest(tie, 0.5): at p=0.5 a split in exactly n/2 trees is dropped (thresh = floor(n/2)+1), so the two conflicting 50% splits are both dropped, matching ape::consensus. Pins the tie boundary to an oracle. Separately, findings.md records an open question for the maintainer: for p > 0.5 the code keeps count >= floor(n*p)+1 (strictly more than floor(n*p)), whereas the docstring promises "p or more" and ape keeps count >= p*n. They diverge only when n*p is an exact integer (e.g. n=3, p=2/3: a clade in 2/3 of trees is kept by ape, dropped here). Both are safe; the convention/doc-consistency call is the maintainer's. No threshold change made. Repro: dev/profiling/drivers/tie-check.R. Co-Authored-By: Claude Opus 4.8 --- dev/profiling/drivers/tie-check.R | 67 +++++++++++++++++++++++++++++++ dev/profiling/findings.md | 26 ++++++++++++ tests/testthat/test-consensus.R | 10 +++++ 3 files changed, 103 insertions(+) create mode 100644 dev/profiling/drivers/tie-check.R diff --git a/dev/profiling/drivers/tie-check.R b/dev/profiling/drivers/tie-check.R new file mode 100644 index 000000000..f20f59768 --- /dev/null +++ b/dev/profiling/drivers/tie-check.R @@ -0,0 +1,67 @@ +#!/usr/bin/env Rscript +# Settle the p = 0.5 exact-tie question empirically: +# - what does TreeTools::Consensus drop/keep at exactly 50%? +# - does it agree with ape::consensus at the tie boundary? +# - does it agree with the documented "proportion p or more" semantics? +suppressMessages(pkgload::load_all(quiet = TRUE)) # run from the package root + +cat("TreeTools loaded\n\n") + +splitsOf <- function(tr) { + s <- as.Splits(tr) + sort(vapply(as.character(s), identity, character(1))) +} + +report <- function(label, trees, p) { + n <- length(trees) + trees <- RenumberTips(trees, trees[[1]]) + tt <- Consensus(trees, p = p) + ttx <- Consensus(trees, p = p, hash = FALSE) + ap <- ape::consensus(trees, p = p) + thresh <- floor(n * p) + 1 + cat(sprintf("== %s : n=%d, p=%.3f, thresh=floor(n*p)+1=%d (count >= %d to keep)\n", + label, n, p, thresh, thresh)) + cat(sprintf(" TreeTools Nnode (hashed) = %d ; (exact) = %d ; ape Nnode = %d\n", + tt$Nnode, ttx$Nnode, ap$Nnode)) + agreeHashExact <- isTRUE(all.equal(RootTree(tt, 1), RootTree(ttx, 1))) + agreeApe <- isTRUE(all.equal(RootTree(tt, 1), RootTree(ap, 1))) + cat(sprintf(" hashed==exact : %s ; TreeTools==ape : %s\n", + agreeHashExact, agreeApe)) + if (!agreeApe) { + cat(" TreeTools splits:\n"); print(splitsOf(tt)) + cat(" ape splits:\n"); print(splitsOf(ap)) + } + cat("\n") + invisible(list(tt = tt, ape = ap, agreeApe = agreeApe)) +} + +# 1. The canonical even tie: 2 + 2 conflicting topologies (the test-Consensus tie fixture) +tie <- c(rep(list(BalancedTree(8)), 2L), rep(list(PectinateTree(8)), 2L)) +report("tie 2xBal + 2xPec (8 tip)", tie, 0.5) + +# 2. Minimal 2-tree conflict (n = 2, each conflicting split at exactly 50%) +two <- list(ape::read.tree(text = "((a, b), (c, d));"), + ape::read.tree(text = "((a, c), (b, d));")) +report("2 conflicting quartets", two, 0.5) + +# 3. A controlled exact-50% on ONE clade: 4 trees, 2 share clade {t1,t2}, 2 share {t2,t3} +# (rest identical) so exactly one pair of conflicting splits sits at 2/4. +mk <- function(txt) ape::read.tree(text = txt) +ctrl <- list( + mk("(((t1,t2),t3),t4,t5,t6);"), + mk("(((t1,t2),t3),t4,t5,t6);"), + mk("((t1,(t2,t3)),t4,t5,t6);"), + mk("((t1,(t2,t3)),t4,t5,t6);") +) +report("controlled 2-2 clade tie (6 tip)", ctrl, 0.5) + +# 4. Threshold boundary at p = 2/3 with n = 3 (n*p = 2 exactly): is a 2/3 split kept? +# doc 'p or more' => count/n >= 2/3 => count >= 2 keep; impl floor(2)+1=3 => drop. +thr <- list( + mk("(((t1,t2),t3),t4,t5,t6);"), + mk("(((t1,t2),t3),t4,t5,t6);"), + mk("((t1,(t2,t3)),t4,t5,t6);") +) +report("p=2/3, n=3, clade at exactly 2/3", thr, 2 / 3) + +cat("done\n") diff --git a/dev/profiling/findings.md b/dev/profiling/findings.md index 8b2570988..02b494ecb 100644 --- a/dev/profiling/findings.md +++ b/dev/profiling/findings.md @@ -32,6 +32,32 @@ user, not yet added): a high-k unlabelled forest with **shuffled** tip order (real relabel → C-002), and a tall **low-concordance `p < 1`** forest (→ C-001). C-004 does move the small cells (strip was 19–29% of them). +## Threshold semantics at `p > 0.5` — open question for maintainer (2026-06-03) + +Prompted by "how are p = 0.5 ties resolved?". Two distinct findings (repro: +`drivers/tie-check.R`): + +- **p = 0.5 ties are handled correctly.** `thresh = floor(n·p) + 1`, so a split + in exactly n/2 trees (`count = n/2 < thresh`) is **dropped**. Two conflicting + 50% splits are both dropped ⇒ valid tree. Provably correct (any two retained + splits each occur in > n/2 trees ⇒ counts sum > n ⇒ co-occur ⇒ compatible) and + **matches `ape::consensus`** (which bumps `p <- 0.5000001` to the same effect). + Now pinned by `ApeTest(tie, 0.5)` in test-Consensus.R (the even-n tie fixture + was previously only in the hashed==exact test — internal counters only, no + oracle). + +- **p > 0.5 is off-by-one *stricter* than the docstring and `ape`.** The + docstring promises "a proportion `p` or more"; `ape` keeps `count >= p·n` + (`bs >= p * ntree`). TreeTools keeps `count >= floor(n·p) + 1`, i.e. *strictly + more than* `floor(n·p)`. These **diverge only when `n·p` computes to an exact + integer k**: ape keeps `count = k`, TreeTools requires `count >= k+1`. E.g. + n=3, p=2/3: a clade in exactly 2 of 3 trees is kept by ape, dropped by + TreeTools. Both choices are safe (valid tree) and both are FP-fragile at the + boundary. **Convention + doc-consistency decision is the maintainer's:** either + align the code to "p or more" (a tolerance-aware integer threshold, *not* a + copy of ape's fragile `>=`), or correct the docstring to "more than `p`". + Not pre-built. No code/threshold change made. + ## Jansson decision (tracking) Open question: does deterministic Jansson (Maj_Rule_Plus, O(kn)) beat OPTIMISED diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R index bac1d8118..38f5389c2 100644 --- a/tests/testthat/test-consensus.R +++ b/tests/testthat/test-consensus.R @@ -47,6 +47,16 @@ test_that("Consensus()", { ApeTest(trees, 0.5) expect_equal(Consensus(trees, 0.999), Consensus(trees, 1)) + # Even-n exact tie: two conflicting splits each in exactly 50% of trees. + # Strict majority keeps a split only when count > n / 2 (thresh = + # floor(n / 2) + 1), so both conflicting splits are dropped and the consensus + # is unresolved there -- always a valid tree. The "exact == hashed" test + # below exercises this fixture too, but only against the other internal + # counter; pin it to an independent oracle (ape) here. + tie <- c(rep(list(BalancedTree(8)), 2L), rep(list(PectinateTree(8)), 2L)) + tie <- RenumberTips(tie, tie[[1]]) + ApeTest(tie, 0.5) + ApeTest(as.phylo(0:2, 8)) ApeTest(as.phylo(0:250, 8)) ApeTest(as.phylo(0:250, 80)) From 6f1ae56a769b65cb23db9b21cbc101cce08b3908 Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 13:12:21 +0100 Subject: [PATCH 08/11] Simplify comment --- R/Consensus.R | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/R/Consensus.R b/R/Consensus.R index 300514e69..f30a9cec7 100644 --- a/R/Consensus.R +++ b/R/Consensus.R @@ -54,15 +54,7 @@ Consensus <- function(trees, p = 1, check.labels = TRUE, hash = TRUE) { stop("Expecting `trees` to be a list.") } - # Coerce to a plain list of standalone trees. c() materialises a labelled - # multiPhylo's shared tip labels onto each tree (the C++ core reads each - # tree's own `tip.label`), exactly as the previous strip-lapply did. We - # deliberately do NOT strip `edge.length` / `node.label`: the consensus core - # reads only `edge` + `tip.label`, and the output tree is rebuilt from - # scratch, so neither field can affect the result. The old per-tree - # `tr[["edge.length"]] <- NULL` forced a copy of every tree for no gain; - # bare c() is ~7x cheaper on metadata-free forests (the common case) and - # output-identical (verified: dev/profiling/drivers/c004-equiv.R, 108 cells). + # c() materialises a labelled multiPhylo's shared tip labels onto each tree. trees <- c(trees) repeat { From 7fcf7d7d9424483f8e74712914cb7a305b6b33b4 Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 13:24:35 +0100 Subject: [PATCH 09/11] rm undue cmt --- tests/testthat/test-consensus.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R index 38f5389c2..b588b10ac 100644 --- a/tests/testthat/test-consensus.R +++ b/tests/testthat/test-consensus.R @@ -67,11 +67,6 @@ test_that("Consensus()", { }) test_that("Consensus() ignores edge lengths, node labels, and TipLabel", { - # Consensus() no longer strips edge.length / node.label up front (it coerces - # with c() instead); the core reads only edge + tip.label and the result is - # rebuilt from scratch, so metadata must not change the answer, and a - # *labelled* multiPhylo (shared TipLabel attr, per-tree labels dropped) must - # be handled via c()'s materialisation even when check.labels = FALSE. base <- list(BalancedTree(8), PectinateTree(8))[c(1, 1, 1, 1, 2, 2, 2)] base <- structure(RenumberTips(base, base[[1]]), class = "multiPhylo") withEL <- structure(lapply(base, function(t) { From 5230e4ba6a4ad9862325b0ca755369712b6acc1b Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 13:39:42 +0100 Subject: [PATCH 10/11] Retain splits at exactly proportion p for p>0.5 (match ape and docs) Consensus(trees, p) documented that it retains splits in "a proportion p or more" of trees, but the threshold was floor(n*p)+1 -- one tree too strict, so a split in exactly p*n trees was dropped at integer thresholds (e.g. a split in 2 of 3 trees with p = 2/3). ape::consensus keeps it (count >= p*n). Change the C++ threshold to thresh = max(floor(n/2)+1, ceil(p*n)), with a relative tolerance (1e-9*pn) snapping p*n to an exact integer through floating-point noise -- robust where ape's raw `bs >= p*ntree` is fragile. p = 0.5 is unchanged: the floor(n/2)+1 term keeps it strict (a split must occur in MORE than half the trees) so two conflicting 50% splits can never both be retained and the result stays a valid tree. Pin the boundary with ApeTest(thirds, 2/3) (clade in exactly 2/3 of trees). Update docstring/Rd and NEWS. Gate 590 PASS / 0 ape disagreements; full suite 4195 PASS. Co-Authored-By: Claude Opus 4.8 --- NEWS.md | 9 +++++++++ R/Consensus.R | 18 +++++++++++------- dev/profiling/findings.md | 22 +++++++++++----------- man/Consensus.Rd | 18 +++++++++++------- src/consensus.cpp | 20 ++++++++++++++++++-- tests/testthat/test-consensus.R | 8 ++++++++ 6 files changed, 68 insertions(+), 27 deletions(-) diff --git a/NEWS.md b/NEWS.md index 162f9ef02..cf8e4f240 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,14 @@ # TreeTools 2.4.0.9000 (development) # +## Bug fixes + +- `Consensus(trees, p)` now retains a split present in exactly a proportion `p` + of trees (i.e. in `ceiling(p * length(trees))` trees) for `p > 0.5`, matching + the documentation and `ape::consensus()`; previously such a split was dropped + at exact thresholds (e.g. a split in 2 of 3 trees with `p = 2/3`). The + majority threshold `p = 0.5` is unchanged (a split must occur in more than + half the trees). + ## Performance - Guarantee preorder return from `root_on_node()` to simplify `Consensus()` diff --git a/R/Consensus.R b/R/Consensus.R index f30a9cec7..e78c3363c 100644 --- a/R/Consensus.R +++ b/R/Consensus.R @@ -7,15 +7,19 @@ #' against every other tree in linear time. The majority-rule and threshold #' consensus (`0.5 <= p < 1`) instead count the frequency of every split across #' all trees in a single pass and retain those occurring in a proportion `p` or -#' more of trees; this runs in time linear in the number of trees, after -#' \insertCite{Jansson2016}{TreeTools}. By default the count uses -#' a 128-bit hash, whose results are exact with overwhelming probability; set -#' `hash = FALSE` for a slower but guaranteed-exact count. +#' more of trees (i.e. in at least `ceiling(p * length(trees))` trees); this +#' runs in time linear in the number of trees, after +#' \insertCite{Jansson2016}{TreeTools}. The majority threshold `p = 0.5` is +#' strict: a split is retained only if it occurs in *more* than half the trees, +#' so that two conflicting splits can never both be reported. By default the +#' count uses a 128-bit hash, whose results are exact with overwhelming +#' probability; set `hash = FALSE` for a slower but guaranteed-exact count. #' #' @param trees List of trees, optionally of class `multiPhylo`. -#' @param p Proportion of trees that must contain a split for it to be reported -#' in the consensus. `p = 0.5` gives the majority-rule consensus; `p = 1` (the -#' default) gives the strict consensus. +#' @param p A number from 0.5 to 1 giving the proportion of trees that must +#' contain a split for it to be reported in the consensus: from `p = 0.5` (more +#' than half the trees; the majority-rule consensus) to `p = 1` (every tree; the +#' strict consensus, the default). #' @param check.labels Logical specifying whether to check that all trees have #' identical labels. Defaults to `TRUE`, which is slower. #' @param hash Logical; if `TRUE` (default), majority/threshold consensus diff --git a/dev/profiling/findings.md b/dev/profiling/findings.md index 02b494ecb..e603dc3be 100644 --- a/dev/profiling/findings.md +++ b/dev/profiling/findings.md @@ -46,17 +46,17 @@ Prompted by "how are p = 0.5 ties resolved?". Two distinct findings (repro: was previously only in the hashed==exact test — internal counters only, no oracle). -- **p > 0.5 is off-by-one *stricter* than the docstring and `ape`.** The - docstring promises "a proportion `p` or more"; `ape` keeps `count >= p·n` - (`bs >= p * ntree`). TreeTools keeps `count >= floor(n·p) + 1`, i.e. *strictly - more than* `floor(n·p)`. These **diverge only when `n·p` computes to an exact - integer k**: ape keeps `count = k`, TreeTools requires `count >= k+1`. E.g. - n=3, p=2/3: a clade in exactly 2 of 3 trees is kept by ape, dropped by - TreeTools. Both choices are safe (valid tree) and both are FP-fragile at the - boundary. **Convention + doc-consistency decision is the maintainer's:** either - align the code to "p or more" (a tolerance-aware integer threshold, *not* a - copy of ape's fragile `>=`), or correct the docstring to "more than `p`". - Not pre-built. No code/threshold change made. +- **p > 0.5 was off-by-one *stricter* than the docstring and `ape` — FIXED.** + The docstring promises "a proportion `p` or more"; `ape` keeps `count >= p·n`. + The old code kept `count >= floor(n·p) + 1`, diverging when `n·p` is an exact + integer k (ape keeps `count = k`; old code required `count >= k+1`; e.g. n=3, + p=2/3 dropped a clade in 2 of 3 trees). **Maintainer chose: align code to + "p or more".** consensus.cpp now uses `thresh = max(floor(n/2)+1, ceil(p·n))` + with a relative tolerance (`1e-9·pn`) snapping `p·n` to an exact integer + through FP noise — robust where ape's raw `>=` is fragile. p = 0.5 unchanged + (strict majority, via the `floor(n/2)+1` floor). Pinned by `ApeTest(thirds, + 2/3)` (n=3 boundary). Docstring + NEWS updated. Gate 590 PASS / 0 ape + disagreements; full suite 4195 PASS. ## Jansson decision (tracking) diff --git a/man/Consensus.Rd b/man/Consensus.Rd index 560ec9c71..911129ab7 100644 --- a/man/Consensus.Rd +++ b/man/Consensus.Rd @@ -9,9 +9,10 @@ Consensus(trees, p = 1, check.labels = TRUE, hash = TRUE) \arguments{ \item{trees}{List of trees, optionally of class \code{multiPhylo}.} -\item{p}{Proportion of trees that must contain a split for it to be reported -in the consensus. \code{p = 0.5} gives the majority-rule consensus; \code{p = 1} (the -default) gives the strict consensus.} +\item{p}{A number from 0.5 to 1 giving the proportion of trees that must +contain a split for it to be reported in the consensus: from \code{p = 0.5} (more +than half the trees; the majority-rule consensus) to \code{p = 1} (every tree; the +strict consensus, the default).} \item{check.labels}{Logical specifying whether to check that all trees have identical labels. Defaults to \code{TRUE}, which is slower.} @@ -35,10 +36,13 @@ The strict consensus (\code{p = 1}) compares the clusters of the first tree against every other tree in linear time. The majority-rule and threshold consensus (\verb{0.5 <= p < 1}) instead count the frequency of every split across all trees in a single pass and retain those occurring in a proportion \code{p} or -more of trees; this runs in time linear in the number of trees, after -\insertCite{Jansson2016}{TreeTools}. By default the count uses -a 128-bit hash, whose results are exact with overwhelming probability; set -\code{hash = FALSE} for a slower but guaranteed-exact count. +more of trees (i.e. in at least \code{ceiling(p * length(trees))} trees); this +runs in time linear in the number of trees, after +\insertCite{Jansson2016}{TreeTools}. The majority threshold \code{p = 0.5} is +strict: a split is retained only if it occurs in \emph{more} than half the trees, +so that two conflicting splits can never both be reported. By default the +count uses a 128-bit hash, whose results are exact with overwhelming +probability; set \code{hash = FALSE} for a slower but guaranteed-exact count. } \examples{ Consensus(as.phylo(0:2, 8)) diff --git a/src/consensus.cpp b/src/consensus.cpp index 309701bfd..61617af40 100644 --- a/src/consensus.cpp +++ b/src/consensus.cpp @@ -8,6 +8,7 @@ using namespace Rcpp; #include /* for fill */ #include /* for array */ #include /* for steady_clock (interrupt timing) */ +#include /* for ceil, round, fabs (threshold) */ #include /* for uint64_t (split hashing) */ #include /* for string (hash key) */ #include /* for unordered_map */ @@ -164,8 +165,23 @@ RawMatrix calc_consensus_tree( StackContainer& S ) { const int32 n_trees = trees.length(); - const int32 frac_thresh = int32(n_trees * p[0]) + 1; - const int32 thresh = frac_thresh > n_trees ? n_trees : frac_thresh; + // Retain a split iff it occurs in a proportion `p` or more of trees, i.e. + // count >= ceil(p * n_trees) -- the documented semantics, matching + // ape::consensus(). A small relative tolerance snaps p * n_trees back to an + // exact integer when floating-point rounding nudges it (e.g. 3 * (2/3) is not + // exactly 2), so a split in exactly p * n trees is kept, not dropped by an + // off-by-one. The majority threshold p = 0.5 is strict (count > n_trees / 2): + // two conflicting splits each in exactly half the trees must not both survive, + // else the retained splits would not be pairwise compatible. + const double pn = double(n_trees) * p[0]; + const double pn_round = std::round(pn); + const int32 ceil_pn = + (std::fabs(pn - pn_round) < 1e-9 * (pn > 1.0 ? pn : 1.0)) + ? int32(pn_round) + : int32(std::ceil(pn)); + const int32 strict_floor = n_trees / 2 + 1; // floor(n / 2) + 1 + const int32 want = ceil_pn > strict_floor ? ceil_pn : strict_floor; + const int32 thresh = want > n_trees ? n_trees : want; std::vector tables; tables.reserve(n_trees); diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R index b588b10ac..5c08a891b 100644 --- a/tests/testthat/test-consensus.R +++ b/tests/testthat/test-consensus.R @@ -57,6 +57,14 @@ test_that("Consensus()", { tie <- RenumberTips(tie, tie[[1]]) ApeTest(tie, 0.5) + # p > 0.5 boundary: a split in exactly p * n trees is retained ("p or more"), + # matching ape. Clade {t1, t2} occurs in 2 of 3 trees (= 2/3 exactly). + thirds <- lapply(c("(((t1, t2), t3), t4, t5, t6);", + "(((t1, t2), t3), t4, t5, t6);", + "((t1, (t2, t3)), t4, t5, t6);"), + function(x) ape::read.tree(text = x)) + ApeTest(thirds, 2 / 3) + ApeTest(as.phylo(0:2, 8)) ApeTest(as.phylo(0:250, 8)) ApeTest(as.phylo(0:250, 80)) From 07c9ee2e0e3ae757730b86d03a69331bbdbac4be Mon Sep 17 00:00:00 2001 From: R script Date: Wed, 3 Jun 2026 15:02:56 +0100 Subject: [PATCH 11/11] Skip scale-only consensus tests under valgrind The mem-check (valgrind) job stalls for many minutes in test-consensus.R: the 100k-leaf heap-limit build, the 33k-tip consensus, and two 33k-tree consensus runs dominate runtime under valgrind's ~30x slowdown while exercising no memory path the smaller cases miss. Gate them behind a new TESTING_MEMCHECK env var (set in memcheck.yml). TESTING_MEMCHECK is deliberately separate from USING_ASAN: the latter suppresses deliberate-bad-input error tests that ASan aborts on but valgrind tolerates, so setting USING_ASAN here would silently disable those error-path tests under valgrind. Full-scale consensus tests still run in uninstrumented CI. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/memcheck.yml | 6 ++++++ tests/testthat/test-consensus.R | 11 ++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/memcheck.yml b/.github/workflows/memcheck.yml index a7df3ff3c..08b1ec0f6 100644 --- a/.github/workflows/memcheck.yml +++ b/.github/workflows/memcheck.yml @@ -47,6 +47,12 @@ jobs: RSPM: https://packagemanager.rstudio.com/cran/__linux__/noble/latest GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} ASAN_OPTIONS: verify_asan_link_order=0 + # Signals to the test suite that it is running under valgrind, so heavy + # scale-only tests can be skipped: valgrind's ~30x slowdown makes them the + # dominant cost while adding no memory-bug coverage beyond the small cases. + # Distinct from USING_ASAN, which suppresses deliberate-bad-input error + # tests that ASan (but not valgrind) aborts on. + TESTING_MEMCHECK: true steps: - uses: ms609/actions/memcheck@main diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R index 5c08a891b..807bfda21 100644 --- a/tests/testthat/test-consensus.R +++ b/tests/testthat/test-consensus.R @@ -17,12 +17,17 @@ test_that("Consensus() errors", { expect_equal(Consensus(c(halfTree)), halfTree) expect_equal(Consensus(list(halfTree)), halfTree) + # Building 100k-leaf trees and a 33k-tip consensus dominates valgrind runtime + # (~30x slowdown) but exercises no memory path the smaller cases above miss; + # full scale still runs in uninstrumented CI. + skip_if(Sys.getenv("TESTING_MEMCHECK") != "") + # Test that trees larger than heap limit fail gracefully expect_error( Consensus(c(BalancedTree(100001), PectinateTree(100001))), "too many leaves.*100000" ) - + largeTree <- BalancedTree(33333) consensus_large <- Consensus(c(largeTree, largeTree)) expect_equal(NTip(consensus_large), 33333) @@ -108,6 +113,10 @@ test_that("Consensus() ignores edge lengths, node labels, and TipLabel", { }) test_that("Consensus() handles large sets of trees", { + # 33k+ trees (past int16_max) is prohibitively slow under valgrind for no + # extra memory coverage: the hashed counter is exercised by the smaller + # forests above. Runs at full scale in uninstrumented CI. + skip_if(Sys.getenv("TESTING_MEMCHECK") != "") oneTree <- as.phylo(0, 13) expect_equal( Consensus(lapply(1:33000, function(x) oneTree)),