diff --git a/.github/workflows/memcheck.yml b/.github/workflows/memcheck.yml
index a7df3ff3c..08b1ec0f6 100644
--- a/.github/workflows/memcheck.yml
+++ b/.github/workflows/memcheck.yml
@@ -47,6 +47,12 @@ jobs:
RSPM: https://packagemanager.rstudio.com/cran/__linux__/noble/latest
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
ASAN_OPTIONS: verify_asan_link_order=0
+ # Signals to the test suite that it is running under valgrind, so heavy
+ # scale-only tests can be skipped: valgrind's ~30x slowdown makes them the
+ # dominant cost while adding no memory-bug coverage beyond the small cases.
+ # Distinct from USING_ASAN, which suppresses deliberate-bad-input error
+ # tests that ASan (but not valgrind) aborts on.
+ TESTING_MEMCHECK: true
steps:
- uses: ms609/actions/memcheck@main
diff --git a/.gitignore b/.gitignore
index 83d996656..02c39d70d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,11 @@ vignettes/*_cache*
/*-benchmark-results
*.bench.Rds
.positai
+
+# /profile skill artefacts (large; rebuilt each round)
+dev/profiling/.vtune-lib-*/
+dev/profiling/.libpath.txt
+dev/profiling/result_*/
+dev/profiling/drivers/*-profvis.html
+dev/profiling/*.csv
+dev/profiling/*.rds
diff --git a/NEWS.md b/NEWS.md
index 42b539778..cf8e4f240 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,9 +1,30 @@
# TreeTools 2.4.0.9000 (development) #
+## Bug fixes
+
+- `Consensus(trees, p)` now retains a split present in exactly a proportion `p`
+ of trees (i.e. in `ceiling(p * length(trees))` trees) for `p > 0.5`, matching
+ the documentation and `ape::consensus()`; previously such a split was dropped
+ at exact thresholds (e.g. a split in 2 of 3 trees with `p = 2/3`). The
+ majority threshold `p = 0.5` is unchanged (a split must occur in more than
+ half the trees).
+
## Performance
- Guarantee preorder return from `root_on_node()` to simplify `Consensus()`
internal pre-processing
+- `Consensus()` and `SplitFrequency()` defer materialising a split's bit pattern
+ until it is needed, so splits that never reach the consensus threshold are no
+ longer built. Identical results; up to ~13× faster for large trees (greatest
+ gains for tall trees / many tips), with no change at small sizes.
+- `RenumberTips()` relabels an unlabelled `multiPhylo` or `list` of trees in a
+ single C++ pass instead of a per-tree R loop, with a no-op fast path for trees
+ already in the target order. Speeds up `Consensus()` and other callers when
+ combining many trees; results are unchanged.
+- `Consensus()` no longer copies every input tree to strip branch lengths and
+ node labels (the consensus core ignores both); it now coerces in place,
+ trimming wrapper overhead (~25% faster on small forests of many short trees).
+ Results are unchanged.
# TreeTools 2.4.0 (2026-06-02) #
diff --git a/R/Consensus.R b/R/Consensus.R
index 484a7dc23..e78c3363c 100644
--- a/R/Consensus.R
+++ b/R/Consensus.R
@@ -7,15 +7,19 @@
#' against every other tree in linear time. The majority-rule and threshold
#' consensus (`0.5 <= p < 1`) instead count the frequency of every split across
#' all trees in a single pass and retain those occurring in a proportion `p` or
-#' more of trees; this runs in time linear in the number of trees, after
-#' \insertCite{Jansson2016}{TreeTools}. By default the count uses
-#' a 128-bit hash, whose results are exact with overwhelming probability; set
-#' `hash = FALSE` for a slower but guaranteed-exact count.
+#' more of trees (i.e. in at least `ceiling(p * length(trees))` trees); this
+#' runs in time linear in the number of trees, after
+#' \insertCite{Jansson2016}{TreeTools}. The majority threshold `p = 0.5` is
+#' strict: a split is retained only if it occurs in *more* than half the trees,
+#' so that two conflicting splits can never both be reported. By default the
+#' count uses a 128-bit hash, whose results are exact with overwhelming
+#' probability; set `hash = FALSE` for a slower but guaranteed-exact count.
#'
#' @param trees List of trees, optionally of class `multiPhylo`.
-#' @param p Proportion of trees that must contain a split for it to be reported
-#' in the consensus. `p = 0.5` gives the majority-rule consensus; `p = 1` (the
-#' default) gives the strict consensus.
+#' @param p A number from 0.5 to 1 giving the proportion of trees that must
+#' contain a split for it to be reported in the consensus: from `p = 0.5` (more
+#' than half the trees; the majority-rule consensus) to `p = 1` (every tree; the
+#' strict consensus, the default).
#' @param check.labels Logical specifying whether to check that all trees have
#' identical labels. Defaults to `TRUE`, which is slower.
#' @param hash Logical; if `TRUE` (default), majority/threshold consensus
@@ -54,12 +58,8 @@ Consensus <- function(trees, p = 1, check.labels = TRUE, hash = TRUE) {
stop("Expecting `trees` to be a list.")
}
- # Remove irrelevant metadata so we don't waste time processing it
- trees <- lapply(c(trees), function(tr) {
- tr[["edge.length"]] <- NULL
- tr[["node.label"]] <- NULL
- tr
- })
+ # c() materialises a labelled multiPhylo's shared tip labels onto each tree.
+ trees <- c(trees)
repeat {
nTip <- NTip(trees)
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 54c14a345..243e2273e 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -101,6 +101,10 @@ renumber_tips_batch <- function(trees, perm, n_tip, new_labels) {
.Call(`_TreeTools_renumber_tips_batch`, trees, perm, n_tip, new_labels)
}
+renumber_tips_to <- function(trees, target) {
+ .Call(`_TreeTools_renumber_tips_to`, trees, target)
+}
+
cpp_edge_to_splits <- function(edge, order, nTip) {
.Call(`_TreeTools_cpp_edge_to_splits`, edge, order, nTip)
}
diff --git a/R/tree_numbering.R b/R/tree_numbering.R
index ff6cdc2c5..dd6aad23b 100644
--- a/R/tree_numbering.R
+++ b/R/tree_numbering.R
@@ -601,6 +601,25 @@ TntOrder.NULL <- function(tree) NULL
#' @export
RenumberTips <- function(tree, tipOrder) UseMethod("RenumberTips")
+# Fast path shared by the unlabelled multiPhylo and list methods: relabel an
+# entire forest to a single target order in one C++ call, instead of dispatching
+# RenumberTips.phylo per tree in R. Returns the relabelled list, or NULL to
+# signal that the caller should fall back to the per-tree R path (the C++ helper
+# declines anything it would not handle identically: see renumber_tips_to()).
+.RenumberForestToC <- function(tree, tipOrder) {
+ if (is.numeric(tipOrder)) {
+ # Numeric tipOrder indexes each tree's OWN labels, so the target differs per
+ # tree; only the R path handles that.
+ return(NULL)
+ }
+ newOrder <- TipLabels(tipOrder, single = TRUE)
+ if (anyDuplicated(newOrder)) {
+ # Let RenumberTips.phylo raise its "repeated in tipOrder" error.
+ return(NULL)
+ }
+ renumber_tips_to(tree, newOrder)
+}
+
#' @rdname RenumberTips
#' @export
RenumberTips.phylo <- function(tree, tipOrder) {
@@ -667,9 +686,16 @@ RenumberTips.multiPhylo <- function(tree, tipOrder) {
labelled <- !is.null(at[["TipLabel"]])
if (!labelled) {
- # Unlabelled: each tree may have different tip orderings,
- # so must process individually
- tree <- lapply(tree, RenumberTips.phylo, tipOrder)
+ # Unlabelled: each tree may have its own tip ordering. Relabel the whole
+ # forest in one C++ pass; fall back to the per-tree R path for inputs the
+ # fast path declines (numeric tipOrder, differing label sets, non-phylo
+ # elements, ...).
+ relabelled <- .RenumberForestToC(tree, tipOrder)
+ tree <- if (is.null(relabelled)) {
+ lapply(tree, RenumberTips.phylo, tipOrder)
+ } else {
+ relabelled
+ }
attributes(tree) <- at
return(tree)
}
@@ -726,7 +752,8 @@ RenumberTips.multiPhylo <- function(tree, tipOrder) {
#' @rdname RenumberTips
#' @export
RenumberTips.list <- function(tree, tipOrder) {
- lapply(tree, RenumberTips, tipOrder)
+ relabelled <- .RenumberForestToC(tree, tipOrder)
+ if (is.null(relabelled)) lapply(tree, RenumberTips, tipOrder) else relabelled
}
#' @rdname RenumberTips
diff --git a/dev/profiling/DECISION.md b/dev/profiling/DECISION.md
new file mode 100644
index 000000000..9772c8c05
--- /dev/null
+++ b/dev/profiling/DECISION.md
@@ -0,0 +1,100 @@
+# Consensus: profiling outcome & the Jansson decision
+
+**For MRS — read this first.** You asked me to implement Jansson's algorithm
+alongside the current O(kn) consensus, optimise both, and let data pick a winner.
+
+## TL;DR
+
+1. **I did NOT implement the full Jansson (Maj_Rule_Plus) algorithm.** The data
+ say it cannot beat the (now-optimised) hashed counter in any realistic,
+ time-meaningful regime, and it is high-risk to get right (the authors' own
+ reference impl, FACT, ships **broken** majority code). The exact algorithm and
+ what a future implementation would need are written down below so it can be
+ built on request.
+2. **I optimised the existing hashed counter** (deferred split materialisation):
+ **up to 13× faster** at high n (tall trees), median **1.23×**, with **zero
+ change to results** (split sets identical to the deterministic `exact` path,
+ verified at n up to 3000). This is the real, shipping win.
+3. **hashed stays the default; `exact` stays the deterministic fallback.** No
+ crossover that warrants switching algorithms; one approach (hashed) wins
+ across the board — so, per your "avoid redundant code" steer, no third path.
+
+## Why not Jansson — the data
+
+Jansson's deterministic O(kn) majority algorithm exists to **count cluster
+frequencies without hashing**, by matching each input tree against an evolving
+O(n)-cluster candidate tree (Day's algorithm + `One_Way_Compatible` +
+`Merge_Trees` + delete/insert). Its only advantage over the current code is
+*determinism* (no ~1e-30 hash-collision risk — which you've explicitly waived)
+and avoiding `exact`'s O(k·n·h) on tall trees.
+
+**Measured lower bound (rigorous):** Jansson ≥ 2× the strict-consensus path
+(Phase 2 ≈ strict's k Day-matchings; Phase 1 ≥ Phase 2, adding
+One_Way_Compatible + Merge_Trees + per-iteration table rebuilds). Strict's inner
+loop is *tighter* than Jansson's, so this under-counts Jansson → the verdict is
+conservative. Where `2×strict ≥ hashed`, **Jansson provably loses**. Results vs
+the optimised hashed (`drivers/jansson-bound.R`):
+
+| regime | concordance | 2×strict / hashed | verdict |
+|--------|-------------|-------------------|---------|
+| high-k/low-n (all) | any | 1.1–1.8 | **Jansson loses (proven)** |
+| high-k/high-n (2000,500) | concordant / moderate | 1.24 / 1.54 | **loses (proven)** |
+| high-k/high-n (1000,1000) | concordant / moderate | 1.32 / 1.42 | **loses (proven)** |
+| low-k/high-n (5000,10) | concordant / moderate | 0.99 / 1.09 | loses / borderline |
+| low-k/high-n (10000,5) | any | 0.45–1.07 | not proven — but <60 ms absolute |
+| high-k/high-n (≥2000,≥500) | **extreme** (rand/tall) | 0.44–0.56 | not proven |
+
+**The only cells where Jansson is not proven to lose are** (a) sub-60 ms
+absolute times (low-k/very-high-n — a 2× gap is <30 ms, irrelevant), or
+(b) **extreme-conflict** input (≈ independent random trees), whose majority
+consensus is a near-empty star — not something anyone computes. Realistic
+consensus inputs (bootstrap / Bayesian / MPT sets, even conflicting gene-tree
+sets) are concordant-to-moderate, and there Jansson provably loses.
+
+This is consistent with the authors' own C++ `Maj_Rule_Plus` timings (~10×
+slower than TreeTools' hashed at small n).
+
+**Net:** building, proving, and CRAN-hardening `One_Way_Compatible` +
+`Merge_Trees` + a dynamic delete/insert tree — the exact machinery FACT got
+wrong — to maybe win an unrealistic corner by a few ms is a bad trade against
+"correctness paramount / avoid redundant code". **Re-openable**: if you want the
+deterministic-O(kn) guarantee regardless, the algorithm is
+Maj_Rule_Plus Phase 1 (Fig. 2 of arXiv:1307.7821) + a standard-majority Phase 2
+(keep `count > k·p` instead of `K(v) > Q(v)`); subroutine specs in
+PLAN-consensus.md. Say the word.
+
+## The optimisation that shipped (C-001)
+
+`count_splits_hashed` no longer materialises every distinct split's bit pattern
+eagerly. Each distinct split keeps a 12-byte witness `(tree, L, R)`; the packed
+pattern is rebuilt only for splits that reach the consensus threshold (or all,
+for `SplitFrequency`). At high n the wasted materialisation of non-surviving
+splits was the dominant cost. Verified speedups (min of reps,
+`drivers/compare-grids.R`): tall(10000,5) **×13.0**, tall(5000,10) ×8.6,
+rand(10000,5) ×2.5, high-k/high-n ×1.5–2.9, median ×1.23. **Results identical to
+the deterministic `exact` path in every cell.** Both rewired paths are gated at
+shipping scale: `correctness-gate.R` (590 checks) verifies consensus split SETS
+at n≤3000 and `SplitFrequency` split sets AND counts at n=2000/5000 (hashed ==
+exact); package `test-consensus.R` (8/8) and `test-Support.R` (6/6) pass;
+`verify-consensus.R` green.
+
+## What else the profiling found (not yet actioned)
+
+- **C-002 (open):** at **high-k/low-n** the R wrapper is **57%** of `Consensus()`
+ wall time; `RenumberTips` is 54% of that. A safe fast-path (batch C++ relabel
+ / skip when labels already consistent) is the next throughput win in that
+ regime. Touches shared code → needs the full test suite as a gate. Deferred to
+ avoid shipping a risky shared-code change while you're away.
+- **C-003 (low priority):** hashed's `unordered_map` churn dominates the
+ low-height extreme-conflict case; only matters for degenerate inputs.
+- **Threshold convention (FYI, unchanged):** for 0.5
k·p`; ape keeps `>= k·p`; roxygen says "p or more". Flagging — your call.
+
+## Correctness fixes made en route
+
+- `dev/red-team/verify-consensus.R` was **dead**: it called
+ `consensus_tree(..., hash=FALSE)` but the arg is `exact` → "unused argument".
+ Fixed (`exact = TRUE`); now green (0 failures).
+- Built a method-pluggable gate (`correctness-gate.R`) — hashed==exact==ape@0.5,
+ 588 checks. (Found & fixed a `which.min()` bug in it that had been
+ silently skipping every ape comparison.)
diff --git a/dev/profiling/PLAN-consensus.md b/dev/profiling/PLAN-consensus.md
new file mode 100644
index 000000000..08bfa9bb4
--- /dev/null
+++ b/dev/profiling/PLAN-consensus.md
@@ -0,0 +1,121 @@
+# Consensus optimisation & Jansson comparison — plan
+
+Date opened: 2026-06-02. Author: Claude (Opus 4.8), at MRS's request (user unavailable; no plan approval possible — proceeding after advisor consult).
+
+## Goal (user's words)
+
+> Aggressively /profile the new Consensus implementation. Fast throughput at
+> high-k/low-n, high-n/low-k, and high-n/high-k. We have an O(kn) impl that
+> doesn't mirror Jansson's; implement Jansson's algo in parallel; optimise both;
+> use data to decide which wins where. Crossover ⇒ switch near it; one [almost]
+> always wins (within a couple %) ⇒ keep just that, drop redundant code.
+> Correctness paramount: **no systematic error**. Micro-probability of split
+> collision is acceptable.
+
+## What the code does today (`src/consensus.cpp`, `R/Consensus.R`)
+
+- **Strict (p = 1)**: single-reference O(kn) pass over tree 0 (`CLUSTONL/R` +
+ contiguity test). Already optimal. *Out of scope.*
+- **Majority / threshold (0.5 ≤ p < 1)**: count every split's frequency in one
+ pass, keep those with `count >= thresh` where
+ `thresh = min(floor(k·p)+1, k)` (i.e. strictly > k·p). Correct *by
+ construction*: every kept split occurs in > k/2 trees ⇒ pairwise- (hence
+ globally-) compatible ⇒ `splits_to_edge` forms a tree with no merge step.
+ Two counting cores share one postorder primitive (`for_each_internal_node`):
+ - `count_splits_hashed` — **default**. O(kn). 128-bit splitmix64 sum-hash of
+ leaf ids, accumulated incrementally up the tree. Probabilistic (≈1e-30
+ collision). Tiny constant: one postorder pass, O(1)/node.
+ - `count_splits_exact` — `hash = FALSE`. O(k·n·h). Builds an exact packed
+ leaf-set bitmask key by iterating L..R per cluster (cost = Σ cluster sizes =
+ O(n·h)). Deterministic. Slow for tall trees (h ~ n).
+
+## Jansson's algorithm (arXiv 1307.7821 = SODA/RECOMB 2013; J.ACM 2016)
+
+- Deterministic O(kn) **standard majority** lives in ref [18] (RECOMB 2013).
+ The arXiv paper's `Maj_Rule_Plus` (Fig. 2) is the same Phase-1 candidate
+ generation; only the keep-test differs.
+- **Phase 1** (candidate generation, ⊇ all majority clusters): maintain a
+ candidate *tree* T (start = T_1, all counts 1). For each later tree T_j:
+ Boyer–Moore-style update — `+1` if a candidate cluster occurs in T_j, `-1` if
+ incompatible with T_j, unchanged if compatible-but-absent; delete count-0
+ nodes (top-down); then insert T_j's clusters that are compatible with T but
+ absent. Uses **Day's algorithm** (= our `ClusterTable`, O(1) "occurs in?"
+ after O(n) prep), **`One_Way_Compatible`** and **`Merge_Trees`** (ref [17],
+ SODA/J.ACM — still need to fetch their exact procedures).
+- **Phase 2 (standard majority)**: recount K(v) = #trees each surviving
+ candidate occurs in; keep iff K(v) > k·p (same threshold as today). (The
+ arXiv paper's Maj+ Phase 2 instead keeps K(v) > Q(v); we do **not** want
+ Maj+ — it is a *different, more resolved* tree and would be a systematic
+ error vs `ape::consensus`.)
+
+Derivation soundness: majority ⊆ majority(+), so the Phase-1 candidate set
+(⊇ all Maj+ clusters, Lemma 4) ⊇ all majority clusters for any p ≥ 0.5. Phase 2
+keeps exactly the > k·p subset; survivors live in tree T so are mutually
+compatible ⇒ valid tree. ✓
+
+## Why this is subtle / the key prior
+
+- **FACT (Jansson's own C++ reference impl) majority code is BROKEN** (verified
+ by build 2026-05; see memory `fact-majority-broken.md` + this dir). Crashes /
+ drops majority splits. ⇒ Implement from the **paper with proofs**, never port
+ FACT. The bug is in FACT's *code*, not Jansson's *algorithm*.
+- **The current method is already O(kn) (hashed) and correct-by-construction**,
+ needing no merge — a genuine simplification that Jansson does not have.
+- **Early evidence hashed dominates**: the paper's own C++ `Maj_Rule_Plus`
+ timings (their 2.2 GHz box) are ≈10× slower than TreeTools' historical hashed
+ numbers (e.g. (k1000,n100): FACT 1.29 s vs TreeTools ≈0.13 s). Jansson does
+ several O(n) passes/tree (Day prep, 2× One_Way_Compatible, Merge_Trees, BST);
+ hashed does one O(1)/node pass. So Jansson's realistic edge is **determinism**
+ + **beating `exact` on tall trees** (O(kn) vs O(knh)), NOT beating hashed.
+
+## Framing the comparison
+
+Jansson competes with **`exact`** (the deterministic path), not with `hashed`.
+- vs `hashed`: user has waived collision risk; Jansson must also be *faster* to
+ win, which the analysis says it won't be. Expect hashed = default everywhere.
+- vs `exact`: Jansson O(kn) should beat exact O(knh) for **tall, high-n** trees;
+ exact's smaller constant may win for **short/balanced or small** trees. A real
+ crossover may exist *within the deterministic option*.
+
+## Regimes & covariates to benchmark
+
+(k, n): high-k/low-n (k≈1–5k, n≈20–50); low-k/high-n (k≈5–20, n≈2–10k);
+high-k/high-n (k≈500–2k, n≈1–5k). **Covariates that matter as much as (k,n):**
+- **Tree shape / height h**: balanced (h~log n) vs pectinate (h~n). Drives
+ exact & Jansson. *Must vary.* (Existing benches use random n=100 only.)
+- **Concordance**: identical / highly-concordant / random-independent. Drives
+ #distinct splits (hashmap size; Jansson insert-delete churn).
+- **p**: 0.5 and 0.75.
+
+## Phases
+
+- **P0 — Scaffold + harness.** dev/profiling/ skeleton; deterministic
+ cross-regime benchmark grid (hashed vs exact first); upgrade
+ verify-consensus.R into the correctness gate (hashed == exact == ape across a
+ big grid + adversarial fixtures). Every new method must pass this gate.
+- **P1 — Profile & optimise existing.** profvis the R wrapper (RenumberTips,
+ Preorder, metadata strip, splits_to_edge, RootTree — likely dominate at
+ low-n/high-k); VTune the C++ core. Verify each fix with std::chrono micro-bench.
+ Prime candidates: incremental bitmask-OR for `exact` (O(knh/W)); ClusterTable
+ ctor cost (k constructions); hashmap reserve/flat-map for hashed; trim R wrapper.
+- **P2 — Implement Jansson.** Fetch [17] for One_Way_Compatible/Merge_Trees (or
+ reconstruct + prove). Add internal third path. Gate through the harness — must
+ equal hashed/exact/ape on the full grid + adversarial fixtures.
+- **P3 — Benchmark all three across grid; analyse crossovers.** Data table +
+ plots; identify winners per cell.
+- **P4 — Integrate decision + document.** Wire dispatch (expected: hashed
+ default; deterministic fallback = winner of {exact, Jansson}, maybe
+ shape-switched). Update NEWS, roxygen, memory note.
+
+## Open questions for advisor
+
+1. Scope: full Jansson is a large, error-prone C++ effort whose likely payoff
+ (per the paper's own timings) is "confirm hashed wins." Stage it behind the
+ harness + exact-optimisation + benchmark, and make *shipping* it conditional
+ on data showing exact is a real deterministic-path bottleneck? (Still
+ implement+verify it to GET the data, per the explicit request.)
+2. Agree Jansson competes with `exact`, not `hashed`?
+3. Any correctness trap in the standard-majority Phase-2 derivation above?
+4. Anything in the benchmark design that would let a wrong conclusion slip
+ through (e.g. measuring only random trees, or core-only timing hiding R
+ overhead)?
diff --git a/dev/profiling/baselines.md b/dev/profiling/baselines.md
new file mode 100644
index 000000000..28883a797
--- /dev/null
+++ b/dev/profiling/baselines.md
@@ -0,0 +1,52 @@
+# Consensus baselines (installed -O2 -g build; R 4.7.0; this machine)
+
+Core timing = `TreeTools:::consensus_tree(forest, p=0.5)` on RenumberTips+Preorder'd
+forests (min of 4 reps after warmup). Full grid in `results-grid.csv`. Driver:
+`drivers/consensus-grid.R`. Run 1: 2026-06-02.
+
+## Headline: hashed dominates exact in EVERY cell (1.1x–3x). No crossover.
+
+| regime (n,k) | scenario | hashed min(s) | exact min(s) | exact/hashed |
+|---------------------|-----------|---------------|--------------|--------------|
+| highK_lowN (30,5000)| rand | 0.0647 | 0.0723 | 1.12 |
+| highK_lowN (50,2000)| tall | 0.0450 | 0.0495 | 1.10 |
+| lowK_highN (10000,5)| rand | 0.0422 | 0.1269 | 3.01 |
+| lowK_highN (10000,5)| tall | 0.1885 | 0.2885 | 1.53 |
+| lowK_highN (5000,10)| rand | 0.0291 | 0.0743 | 2.55 |
+| highK_highN(2000,500)| rand | 0.6648 | 1.2550 | 1.89 |
+| highK_highN(2000,500)| tall | 1.2358 | 1.9502 | 1.58 |
+| highK_highN(1000,1000)| concord70| 0.2308 | 0.3917 | 1.70 |
+
+## Cost decomposition (what actually dominates)
+
+- **R-wrapper overhead** (`Consensus()` end-to-end minus core), by regime:
+ | (n,k) | Consensus() | core | wrapper | wrapper % |
+ |--------------|-------------|--------|---------|-----------|
+ | (30, 5000) | 0.1480 | 0.0636 | 0.0844 | **57%** |
+ | (100, 100) | 0.0061 | 0.0038 | 0.0023 | 37% |
+ | (2000, 10) | 0.0112 | 0.0086 | 0.0026 | 23% |
+ | (1000, 1000) | 0.6081 | 0.5814 | 0.0267 | 4% |
+ ⇒ At **high-k/low-n** the R wrapper (RenumberTips / metadata strip / Preorder)
+ is the bottleneck, NOT the counter. This is a named target regime.
+
+- **ClusterTable construction** dominates at high-n: tall-vs-rand gap at fixed
+ (n,k) for hashed (e.g. n=10000,k=5: rand 0.042 vs tall 0.189) reflects the
+ O(n·h) split-materialisation + construction work shared by both counters; the
+ hashed/exact gap is secondary to this shared floor.
+
+## Implication for Jansson
+
+Jansson (deterministic O(kn)) competes with `exact` (deterministic), not hashed.
+But (1) hashed already beats exact everywhere by ≤3x; (2) exact never blows up
+(shared construction floor dominates); (3) Jansson structurally does ~2k
+ClusterTable constructions + per-tree One_Way_Compatible + Merge_Trees + Day
+match + delete/insert — several× hashed's single pass — and the authors' own C++
+(Maj_Rule_Plus) benchmarks ~10x slower than TreeTools hashed. So Jansson is very
+unlikely to win any regime. The high-leverage throughput wins are the **R
+wrapper** (high-k/low-n) and **ClusterTable construction** (high-n).
+
+## Threshold-convention note (separate finding, not a bug I introduced)
+
+For 0.5 < p < 1, TreeTools keeps `count > k*p` (`thresh=floor(k*p)+1`); ape keeps
+`count >= k*p`. They agree at p=0.5 and p=1. The roxygen says "p or more" (>=) but
+the code is strict (>). Surfacing for MRS; out of scope to change here.
diff --git a/dev/profiling/correctness-gate.R b/dev/profiling/correctness-gate.R
new file mode 100644
index 000000000..1dfae68b3
--- /dev/null
+++ b/dev/profiling/correctness-gate.R
@@ -0,0 +1,197 @@
+# Correctness gate for consensus methods.
+#
+# THE rule (user): "any systematic error is unacceptable". The hard gate is that
+# every TreeTools consensus counting method produces the IDENTICAL set of
+# consensus splits on every fixture; ape::consensus() is an independent oracle
+# (soft-reported, since its tie convention at count == k/2 can differ).
+#
+# Method-pluggable: add Jansson to `methods` once implemented and it is gated
+# automatically. Loads the installed -O2 build named in .libpath.txt (NOT
+# load_all, so the gate reflects shipped behaviour).
+#
+# Run: Rscript dev/profiling/correctness-gate.R
+suppressWarnings(suppressMessages({
+ libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1]
+ library(TreeTools, lib.loc = libpath)
+}))
+
+# ---- canonical split sets (label-based; polarity-normalised to exclude the
+# alphabetically-first label; trivial splits dropped) -----------------------
+# From a packed RawMatrix (rows = splits, bit i = tip i+1 in the shared numbering
+# given by ref_labels).
+splits_from_matrix <- function(m, ref_labels) {
+ n <- length(ref_labels)
+ excl <- 1L # tip 1 (shared numbering) always on excluded side
+ if (is.null(m) || nrow(m) == 0) return(character(0))
+ out <- character(nrow(m))
+ for (r in seq_len(nrow(m))) {
+ bits <- as.logical(rawToBits(as.raw(m[r, ])))[seq_len(n)]
+ if (isTRUE(bits[excl])) bits <- !bits # excluded label on FALSE side
+ side <- which(bits)
+ if (length(side) < 2 || length(side) > n - 2) next
+ out[r] <- paste(sort(side), collapse = ",")
+ }
+ sort(unique(out[out != ""]))
+}
+
+# From a phylo, expressed in the shared index space defined by ref_labels.
+splits_from_phylo <- function(tree, ref_labels) {
+ n <- length(ref_labels)
+ excl <- 1L
+ pp <- ape::prop.part(tree)
+ labs <- attr(pp, "labels")
+ out <- character(0)
+ for (cl in pp) {
+ idx <- match(labs[cl], ref_labels)
+ if (anyNA(idx)) stop("label mismatch in splits_from_phylo")
+ if (excl %in% idx) idx <- setdiff(seq_len(n), idx) # exclude first label
+ if (length(idx) < 2 || length(idx) > n - 2) next
+ out <- c(out, paste(sort(idx), collapse = ","))
+ }
+ sort(unique(out))
+}
+
+# ---- method registry: forest (already RenumberTips+Preorder) , p -> split set --
+ref_labels_of <- function(forest) forest[[1]][["tip.label"]]
+
+methods <- list(
+ hashed = function(forest, p) splits_from_matrix(
+ TreeTools:::consensus_tree(forest, p, exact = FALSE), ref_labels_of(forest)),
+ exact = function(forest, p) splits_from_matrix(
+ TreeTools:::consensus_tree(forest, p, exact = TRUE), ref_labels_of(forest))
+ # jansson = function(forest, p) splits_from_matrix(
+ # TreeTools:::consensus_tree(forest, p, method = "jansson"), ref_labels_of(forest))
+)
+# ape oracle (soft): only defined for p in [0.5, 1].
+ape_oracle <- function(forest, p)
+ splits_from_phylo(ape::consensus(forest, p = p), ref_labels_of(forest))
+
+fails <- 0L; soft <- 0L; checks <- 0L
+hard_fail <- function(msg) { cat(" HARD FAIL:", msg, "\n"); fails <<- fails + 1L }
+soft_fail <- function(msg) { cat(" (soft) ape disagree:", msg, "\n"); soft <<- soft + 1L }
+
+# Compare all registered methods to each other (HARD) and to ape (SOFT).
+gate_one <- function(forest, p, tag) {
+ checks <<- checks + 1L
+ res <- lapply(methods, function(f) f(forest, p))
+ base <- res[[1]]
+ for (nm in names(res)[-1]) {
+ if (!identical(base, res[[nm]]))
+ hard_fail(sprintf("%s p=%.3g: %s != %s", tag, p, names(res)[1], nm))
+ }
+ # ape is a valid oracle ONLY at p = 0.5: there ">half" matches TreeTools'
+ # `count > k*p`. For p > 0.5, ape keeps `count >= k*p` (>=) while TreeTools
+ # keeps `count > k*p` (>) — a deliberate, documented-as-"p or more" but
+ # actually-strict TreeTools convention. So we only cross-check ape at p=0.5;
+ # the threshold path (p>0.5) is gated by internal hashed==exact==jansson
+ # agreement (the `count >= thresh` filter is shared by all methods).
+ if (isTRUE(all.equal(p, 0.5))) {
+ ok_ape <- tryCatch(ape_oracle(forest, p),
+ error = function(e) structure("ERR", msg = conditionMessage(e)))
+ if (identical(as.vector(ok_ape), "ERR")) {
+ soft_fail(sprintf("%s p=0.5: ape_oracle ERRORED: %s", tag, attr(ok_ape, "msg")))
+ } else if (!identical(base, ok_ape)) {
+ soft_fail(sprintf("%s p=0.5 (n kept: ours=%d ape=%d)",
+ tag, length(base), length(ok_ape)))
+ }
+ }
+}
+
+prep <- function(forest) {
+ forest <- structure(lapply(forest, function(t) {
+ t[["edge.length"]] <- NULL; t[["node.label"]] <- NULL; t
+ }), class = "multiPhylo")
+ Preorder(RenumberTips(forest, forest[[1]]))
+}
+
+ps <- c(0.5, 0.55, 0.6, 2/3, 0.75, 0.8, 0.9, 0.99)
+
+set.seed(1)
+cat("== random forests (shape: rtree) ==\n")
+for (n_tip in c(4, 5, 6, 8, 13, 20, 50)) {
+ for (k in c(2, 3, 5, 7, 8, 15, 32, 60)) {
+ forest <- prep(lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL)))
+ for (p in ps) gate_one(forest, p, sprintf("rand n=%d k=%d", n_tip, k))
+ }
+}
+
+cat("== tall forests (random caterpillars) ==\n")
+for (n_tip in c(8, 20, 60)) {
+ labs <- paste0("t", seq_len(n_tip))
+ for (k in c(3, 7, 16)) {
+ forest <- prep(lapply(seq_len(k), function(i) PectinateTree(sample(labs))))
+ for (p in ps) gate_one(forest, p, sprintf("tall n=%d k=%d", n_tip, k))
+ }
+}
+
+cat("== adversarial structured fixtures ==\n")
+adversarial <- list(
+ all_identical = rep(list(BalancedTree(8)), 7),
+ star_disagree = list(ape::read.tree(text = "((a,b),(c,d));"),
+ ape::read.tree(text = "((a,c),(b,d));")),
+ single_split = list(ape::read.tree(text = "((a,b,c,d),(e,f,g));"),
+ ape::read.tree(text = "((a,b,c,d),(e,f,g));")),
+ one_off_bal = c(rep(list(BalancedTree(10)), 3), list(PectinateTree(10))),
+ threshold_tie = c(rep(list(BalancedTree(8)), 2), rep(list(PectinateTree(8)), 2)),
+ mixed_resoln = list(BalancedTree(8), CollapseNode(BalancedTree(8), 11:12),
+ PectinateTree(8)),
+ bal_vs_pec = list(BalancedTree(8), PectinateTree(8))[c(1,1,1,1,2,2,2)]
+)
+for (nm in names(adversarial)) {
+ forest <- prep(adversarial[[nm]])
+ for (p in ps) gate_one(forest, p, nm)
+}
+
+# ---- HIGH-N content gate (verifies the deferred-materialisation witness path
+# at n where survivors are actually rebuilt from witnesses; the rest of the
+# gate only reaches n<=50). hashed split-SET must equal exact split-SET. ----
+cat("== high-n content (deferred-materialisation witness path) ==\n")
+set.seed(11)
+for (n_tip in c(1000L, 3000L)) {
+ labs <- paste0("t", seq_len(n_tip))
+ hi <- list(
+ # survivor-heavy: ~n-3 majority splits get materialised from witnesses
+ concord70 = { base <- ape::rtree(n_tip, br = NULL); nb <- 9L
+ c(rep(list(base), nb), lapply(seq_len(4L),
+ function(i) ape::rtree(n_tip, br = NULL, tip.label = base$tip.label))) },
+ identical = rep(list(ape::rtree(n_tip, br = NULL)), 12L),
+ # 0-survivor witness path (random caterpillars -> star consensus)
+ tall = lapply(seq_len(12L), function(i) PectinateTree(sample(labs)))
+ )
+ for (nm in names(hi)) {
+ forest <- prep(hi[[nm]])
+ for (p in c(0.5, 2/3)) gate_one(forest, p, sprintf("HIGHN %s n=%d", nm, n_tip))
+ }
+}
+
+# ---- HIGH-N SplitFrequency content+COUNT gate. The witness change also rewired
+# calc_split_frequencies_hashed -> frequencies_list_from_witnesses, whose
+# materialise-all + RawMatrix packing is otherwise only checked at n<=30.
+# hashed must equal exact as a (split -> count) MAP at shipping scale. ------
+cat("== high-n SplitFrequency (hashed vs exact: split sets AND counts) ==\n")
+freq_keyset <- function(forest, exact) {
+ sf <- TreeTools:::split_frequencies(forest, exact = exact)
+ m <- sf[["splits"]]; cnt <- sf[["counts"]]
+ if (is.null(m) || nrow(m) == 0) return(character(0))
+ n <- length(ref_labels_of(forest))
+ out <- character(nrow(m))
+ for (r in seq_len(nrow(m))) {
+ bits <- as.logical(rawToBits(as.raw(m[r, ])))[seq_len(n)]
+ if (isTRUE(bits[1])) bits <- !bits # tip 1 on excluded side (as elsewhere)
+ out[r] <- paste0(paste(which(bits), collapse = ","), "=", cnt[r])
+ }
+ sort(out)
+}
+set.seed(23)
+for (n_tip in c(2000L, 5000L)) {
+ forest <- prep(lapply(seq_len(12L), function(i) ape::rtree(n_tip, br = NULL)))
+ checks <- checks + 1L
+ if (!identical(freq_keyset(forest, FALSE), freq_keyset(forest, TRUE))) {
+ hard_fail(sprintf("SplitFrequency hashed != exact (split/count map) n=%d", n_tip))
+ }
+}
+
+cat(sprintf("\n==== %s ==== %d checks; %d HARD fails; %d soft (ape) disagreements\n",
+ if (fails == 0) "PASS" else "FAIL", checks, fails, soft))
+if (soft > 0) cat("NOTE: soft ape disagreements are usually count==k*p tie-convention; inspect if unexpected.\n")
+quit(status = if (fails == 0) 0L else 1L)
diff --git a/dev/profiling/drivers/c004-equiv.R b/dev/profiling/drivers/c004-equiv.R
new file mode 100644
index 000000000..28076ac7f
--- /dev/null
+++ b/dev/profiling/drivers/c004-equiv.R
@@ -0,0 +1,53 @@
+# C-004 regression: full Consensus() output must be byte-identical before/after
+# replacing the metadata-strip lapply with bare c(trees).
+# Run once on the OLD code (saves ref), once on the NEW code (compares).
+suppressMessages(devtools::load_all(".", quiet = TRUE))
+set.seed(42)
+ref_path <- "dev/profiling/c004-ref.rds"
+
+base <- lapply(1:9, function(i) ape::rtree(12, br = NULL))
+mp <- structure(RenumberTips(structure(base, class = "multiPhylo"), base[[1]]),
+ class = "multiPhylo") # aligned order
+mpU <- structure(base, class = "multiPhylo") # UNaligned order
+addEL <- function(f) structure(lapply(f, function(t){t$edge.length<-seq_len(nrow(t$edge)); t}), class="multiPhylo")
+addNL <- function(f) structure(lapply(f, function(t){t$node.label<-paste0("n",seq_len(t$Nnode)); t}), class="multiPhylo")
+diff_sz <- structure(c(base[1:5], lapply(base[6:9], function(t) DropTip(t, 1:2))), class="multiPhylo")
+
+variants <- list(
+ plain = mp,
+ unaligned = mpU, # exercises check.labels=TRUE relabel
+ edge_len = addEL(mp),
+ node_lab = addNL(mp),
+ both = addNL(addEL(mp)),
+ labelled = ape::.compressTipLabel(mp),
+ lab_el = ape::.compressTipLabel(addEL(mp)),
+ list_in = unclass(mp), # plain list, not multiPhylo
+ diff_size = diff_sz # exercises the KeepTip repeat loop
+)
+grid <- expand.grid(v = names(variants), p = c(0.5, 2/3, 1),
+ check = c(TRUE, FALSE), hash = c(TRUE, FALSE),
+ stringsAsFactors = FALSE)
+canon <- function(tr) {
+ if (!inherits(tr, "phylo")) return(tr)
+ list(edge = Preorder(RootTree(tr, 1))$edge, tip = tr$tip.label, Nnode = tr$Nnode)
+}
+out <- Map(function(v, p, check, hash)
+ tryCatch(canon(suppressWarnings(Consensus(variants[[v]], p, check, hash))),
+ error = function(e) paste("ERR:", conditionMessage(e))),
+ grid$v, grid$p, grid$check, grid$hash)
+
+if (!file.exists(ref_path)) {
+ saveRDS(out, ref_path)
+ cat(sprintf("Saved reference: %d cells -> %s\n", length(out), ref_path))
+} else {
+ ref <- readRDS(ref_path)
+ same <- mapply(identical, ref, out)
+ if (all(same)) {
+ cat(sprintf("PASS: all %d Consensus() outputs identical to reference.\n", length(out)))
+ } else {
+ cat(sprintf("FAIL: %d/%d cells differ:\n", sum(!same), length(out)))
+ bad <- which(!same)
+ for (i in bad) cat(sprintf(" v=%-9s p=%.3g check=%s hash=%s\n",
+ grid$v[i], grid$p[i], grid$check[i], grid$hash[i]))
+ }
+}
diff --git a/dev/profiling/drivers/compare-grids.R b/dev/profiling/drivers/compare-grids.R
new file mode 100644
index 000000000..3d6bcc18f
--- /dev/null
+++ b/dev/profiling/drivers/compare-grids.R
@@ -0,0 +1,34 @@
+# Compare two benchmark-grid CSVs (baseline vs optimised) and print per-cell
+# speedups. Run after consensus-grid.R has overwritten results-grid.csv.
+# Run: Rscript dev/profiling/drivers/compare-grids.R [old.csv] [new.csv]
+a <- commandArgs(TRUE)
+oldf <- if (length(a) >= 1) a[1] else "dev/profiling/results-grid-baseline.csv"
+newf <- if (length(a) >= 2) a[2] else "dev/profiling/results-grid.csv"
+o <- read.csv(oldf); n <- read.csv(newf)
+key <- function(d) paste(d$regime, d$scenario, d$n, d$k, d$method, sep = "|")
+o$key <- key(o); n$key <- key(n)
+m <- merge(o[, c("key", "min_s", "nsplits")],
+ n[, c("key", "min_s", "nsplits")], by = "key",
+ suffixes = c("_old", "_new"))
+m$speedup <- round(m$min_s_old / m$min_s_new, 2)
+m$changed_nsplits <- m$nsplits_old != m$nsplits_new # MUST be all FALSE (correctness)
+parts <- do.call(rbind, strsplit(m$key, "|", fixed = TRUE))
+m$regime <- parts[, 1]; m$scenario <- parts[, 2]; m$n <- as.integer(parts[, 3])
+m$k <- as.integer(parts[, 4]); m$method <- parts[, 5]
+m <- m[order(m$method, m$regime, m$n, m$k, m$scenario), ]
+
+cat("== per-cell speedup (old/new min_s); >1 = faster ==\n")
+for (i in seq_len(nrow(m))) {
+ cat(sprintf("%-7s %-11s %-9s n=%-5d k=%-5d %7.4f -> %7.4f x%-5.2f %s\n",
+ m$method[i], m$regime[i], m$scenario[i], m$n[i], m$k[i],
+ m$min_s_old[i], m$min_s_new[i], m$speedup[i],
+ if (m$changed_nsplits[i]) " *** NSPLITS CHANGED ***" else ""))
+}
+if (any(m$changed_nsplits)) {
+ cat("\n!!! CORRECTNESS REGRESSION: nsplits changed in some cell !!!\n")
+} else {
+ cat("\nnsplits identical in every cell (no correctness change).\n")
+}
+cat(sprintf("hashed median speedup: x%.2f; exact median speedup: x%.2f\n",
+ median(m$speedup[m$method == "hashed"]),
+ median(m$speedup[m$method == "exact"])))
diff --git a/dev/profiling/drivers/consensus-grid.R b/dev/profiling/drivers/consensus-grid.R
new file mode 100644
index 000000000..f6e1ee2c8
--- /dev/null
+++ b/dev/profiling/drivers/consensus-grid.R
@@ -0,0 +1,121 @@
+# Baseline benchmark grid for consensus counting methods.
+#
+# Times the C++ CORE (TreeTools:::consensus_tree) on forests that are already
+# RenumberTips+Preorder'd, so we compare ALGORITHMS apples-to-apples (the shared
+# R-wrapper cost is measured separately at the end). Covaries the three regimes
+# the user named (high-k/low-n, low-k/high-n, high-k/high-n) with tree SHAPE
+# (random vs tall caterpillar) and CONCORDANCE (random / 70%-concordant /
+# identical) — the covariates that drive exact's O(k.n.h) and (later) Jansson.
+#
+# Method dispatch is a single place (`run_method`) so Jansson slots in later.
+# Writes a tidy CSV; guards pathological exact cells with a warmup probe.
+#
+# Run: Rscript dev/profiling/drivers/consensus-grid.R
+suppressWarnings(suppressMessages({
+ libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1]
+ library(TreeTools, lib.loc = libpath)
+}))
+set.seed(5813)
+
+OUT <- "dev/profiling/results-grid.csv"
+REPS <- 4L # timed reps after 1 warmup
+PROBE_BUDGET_S <- 8 # if the warmup rep exceeds this, record it alone & skip reps
+
+timeit <- function(expr) {
+ expr <- substitute(expr)
+ env <- parent.frame()
+ eval(expr, env) # warmup (also a probe)
+ t0 <- Sys.time(); eval(expr, env)
+ warm <- as.numeric(Sys.time() - t0, units = "secs")
+ if (warm > PROBE_BUDGET_S) return(list(min = warm, med = warm, reps = 1L))
+ ts <- numeric(REPS)
+ for (i in seq_len(REPS)) {
+ t0 <- Sys.time(); eval(expr, env)
+ ts[i] <- as.numeric(Sys.time() - t0, units = "secs")
+ }
+ list(min = min(ts), med = stats::median(ts), reps = REPS)
+}
+
+# ---- forest generators (each returns a RenumberTips+Preorder'd multiPhylo) ----
+prep <- function(forest) {
+ forest <- structure(forest, class = "multiPhylo")
+ Preorder(RenumberTips(forest, forest[[1]]))
+}
+gen <- list(
+ rand = function(n, k) prep(lapply(seq_len(k), function(i) ape::rtree(n, br = NULL))),
+ tall = function(n, k) { labs <- paste0("t", seq_len(n))
+ prep(lapply(seq_len(k), function(i) PectinateTree(sample(labs)))) },
+ concord70 = function(n, k) { base <- ape::rtree(n, br = NULL)
+ nb <- ceiling(0.7 * k)
+ prep(c(rep(list(base), nb),
+ lapply(seq_len(k - nb), function(i) ape::rtree(n, br = NULL,
+ tip.label = base$tip.label)))) },
+ identical = function(n, k) { base <- ape::rtree(n, br = NULL)
+ prep(rep(list(base), k)) }
+)
+
+# ---- method dispatch (Jansson added here later) -------------------------------
+run_method <- function(forest, p, method) {
+ switch(method,
+ hashed = TreeTools:::consensus_tree(forest, p, exact = FALSE),
+ exact = TreeTools:::consensus_tree(forest, p, exact = TRUE),
+ # jansson = TreeTools:::consensus_tree(forest, p, method = "jansson"),
+ stop("unknown method"))
+}
+METHODS <- c("hashed", "exact")
+
+# ---- grid: (regime, n, k) -----------------------------------------------------
+grid <- rbind(
+ data.frame(regime = "baseline", n = 100, k = 100),
+ data.frame(regime = "highK_lowN", n = 30, k = 1000),
+ data.frame(regime = "highK_lowN", n = 30, k = 5000),
+ data.frame(regime = "highK_lowN", n = 50, k = 2000),
+ data.frame(regime = "lowK_highN", n = 2000, k = 10),
+ data.frame(regime = "lowK_highN", n = 5000, k = 10),
+ data.frame(regime = "lowK_highN", n = 10000, k = 5),
+ data.frame(regime = "highK_highN", n = 1000, k = 300),
+ data.frame(regime = "highK_highN", n = 1000, k = 1000),
+ data.frame(regime = "highK_highN", n = 2000, k = 500)
+)
+SCENARIOS <- names(gen)
+P <- 0.5
+
+rows <- list()
+first <- TRUE
+for (gi in seq_len(nrow(grid))) {
+ n <- grid$n[gi]; k <- grid$k[gi]; regime <- grid$regime[gi]
+ for (sc in SCENARIOS) {
+ tg0 <- Sys.time()
+ forest <- gen[[sc]](n, k)
+ gen_s <- as.numeric(Sys.time() - tg0, units = "secs")
+ for (m in METHODS) {
+ tr <- timeit(run_method(forest, P, m))
+ res <- run_method(forest, P, m)
+ nsplits <- if (is.null(res)) 0L else nrow(res)
+ row <- data.frame(regime = regime, scenario = sc, n = n, k = k,
+ method = m, p = P, min_s = round(tr$min, 5),
+ med_s = round(tr$med, 5), reps = tr$reps,
+ nsplits = nsplits, gen_s = round(gen_s, 3))
+ rows[[length(rows) + 1]] <- row
+ cat(sprintf("%-12s %-9s n=%-5d k=%-5d %-7s min=%8.4f med=%8.4f reps=%d nsplits=%d\n",
+ regime, sc, n, k, m, tr$min, tr$med, tr$reps, nsplits))
+ utils::write.table(row, OUT, sep = ",", row.names = FALSE,
+ col.names = first, append = !first)
+ first <- FALSE
+ }
+ }
+}
+
+# ---- R-wrapper overhead: Consensus() (full pipeline) vs core-only -------------
+cat("\n== wrapper overhead (end-to-end Consensus vs core consensus_tree) ==\n")
+for (cfg in list(c(30, 5000), c(100, 100), c(2000, 10), c(1000, 1000))) {
+ n <- cfg[1]; k <- cfg[2]
+ forest <- gen$rand(n, k)
+ # raw (unprepared) copy to feed Consensus(), so its RenumberTips/Preorder run
+ raw <- structure(lapply(seq_len(k), function(i) forest[[i]]), class = "multiPhylo")
+ te <- timeit(Consensus(raw, p = 0.5, check.labels = TRUE))
+ tc <- timeit(TreeTools:::consensus_tree(forest, 0.5, exact = FALSE))
+ cat(sprintf("n=%-5d k=%-5d Consensus()=%8.4f core=%8.4f wrapper=%8.4f (%.0f%%)\n",
+ n, k, te$min, tc$min, te$min - tc$min, 100 * (te$min - tc$min) / te$min))
+}
+cat("\nWrote", OUT, "\n")
diff --git a/dev/profiling/drivers/gha-bench-diagnose.R b/dev/profiling/drivers/gha-bench-diagnose.R
new file mode 100644
index 000000000..c57dfbb9e
--- /dev/null
+++ b/dev/profiling/drivers/gha-bench-diagnose.R
@@ -0,0 +1,56 @@
+# Why do the GHA bench-Consensus.R cells show NSD after C-001/C-002?
+# Diagnose: which path each benchmark exercises + wrapper cost breakdown.
+suppressMessages(devtools::load_all(".", quiet = TRUE))
+
+forest201.80 <- as.phylo(0:200, 80)
+forest21.260 <- as.phylo(0:20, 260)
+forest1k.888 <- as.phylo(0:1000, 888)
+forestMaj <- c(as.phylo(rep(c(0, 1e9), c(15, 50)), 100),
+ structure(lapply(rep(100, 50), BalancedTree), class = "multiPhylo"))
+
+describe <- function(nm, f) {
+ cls <- class(f)
+ tl <- attr(f, "TipLabel")
+ has_el <- any(vapply(f, function(t) !is.null(t[["edge.length"]]), logical(1)))
+ has_nl <- any(vapply(f, function(t) !is.null(t[["node.label"]]), logical(1)))
+ # do all trees share tip.label in identical order?
+ if (is.null(tl)) {
+ labs <- lapply(f, function(t) t[["tip.label"]])
+ same_order <- all(vapply(labs[-1], identical, logical(1), labs[[1]]))
+ } else same_order <- NA # labelled: order shared by construction
+ cat(sprintf("%-14s k=%-5d n=%-4d class=%-10s TipLabel=%-5s edge.len=%-5s node.lab=%-5s sameTipOrder=%s\n",
+ nm, length(f), NTip(f[[1]]), paste(cls, collapse=","),
+ !is.null(tl), has_el, has_nl, same_order))
+}
+cat("== forest structure ==\n")
+describe("forest201.80", forest201.80)
+describe("forest21.260", forest21.260)
+describe("forest1k.888", forest1k.888)
+describe("forestMaj", forestMaj)
+
+# ---- Wrapper cost breakdown (replicate Consensus() internals) -----------------
+bk <- function(label, expr, times = 25) {
+ expr <- substitute(expr)
+ e <- parent.frame()
+ t <- replicate(times, { g <- gc(FALSE); s <- Sys.time(); eval(expr, e); as.numeric(Sys.time() - s, units = "secs") })
+ cat(sprintf(" %-26s %8.2f ms (min %6.2f)\n", label, 1000*median(t), 1000*min(t)))
+ invisible(median(t))
+}
+
+breakdown <- function(nm, trees, p = 1, check.labels = TRUE) {
+ cat(sprintf("\n== %s (p=%g, check.labels=%s) ==\n", nm, p, check.labels))
+ hash <- TRUE
+ bk("full Consensus()", Consensus(trees, p, check.labels, hash))
+ bk("metadata-strip lapply", lapply(c(trees), function(tr) { tr[["edge.length"]] <- NULL; tr[["node.label"]] <- NULL; tr }))
+ stripped <- lapply(c(trees), function(tr) { tr[["edge.length"]] <- NULL; tr[["node.label"]] <- NULL; tr })
+ bk("NTip(trees)", NTip(stripped))
+ if (check.labels) bk("RenumberTips", RenumberTips(stripped, stripped[[1]]))
+ rt <- if (check.labels) RenumberTips(stripped, stripped[[1]]) else stripped
+ bk("Preorder(trees[[1]])", Preorder(rt[[1]]))
+ bk("consensus_tree core", TreeTools:::consensus_tree(rt, p, exact = !hash))
+}
+
+breakdown("forest1k.888", forest1k.888, p = 1, check.labels = FALSE)
+breakdown("forest201.80", forest201.80, p = 1, check.labels = FALSE)
+breakdown("forest21.260", forest21.260, p = 1, check.labels = TRUE)
+breakdown("forestMaj", forestMaj, p = 0.5, check.labels = FALSE)
diff --git a/dev/profiling/drivers/jansson-bound.R b/dev/profiling/drivers/jansson-bound.R
new file mode 100644
index 000000000..dd53fc0f1
--- /dev/null
+++ b/dev/profiling/drivers/jansson-bound.R
@@ -0,0 +1,77 @@
+# Measured lower bound on Jansson (Maj_Rule_Plus) cost, without implementing it.
+#
+# Bound (advisor-validated):
+# Phase 2 of Maj_Rule_Plus ~= k Day-matchings of the candidate tree against
+# each input ~= the STRICT path (consensus_tree p=1): k table builds + k O(n)
+# matching passes.
+# Phase 1 >= Phase 2 (same matching PLUS One_Way_Compatible + Merge_Trees +
+# delete/insert + a candidate-table rebuild each iteration).
+# => Jansson total >= 2 * strict-path.
+# Strict's inner loop (single-reference contiguity + CLUSTONL/R, no hash map) is
+# TIGHTER than a real Phase 2, so this UNDER-counts Jansson => a "2*strict >=
+# hashed" verdict is conservative.
+#
+# `rand` forests have NO strict early-exit (strict consensus is a star: 0 of n-3
+# splits found, never hits the perfectly-resolved return) => k full passes, the
+# worst case for strict. We also test `tall`.
+#
+# Verdict per cell: if 2*strict >= hashed, Jansson cannot win there.
+# Run: Rscript dev/profiling/drivers/jansson-bound.R
+suppressWarnings(suppressMessages({
+ libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1]
+ library(TreeTools, lib.loc = libpath)
+}))
+set.seed(5813)
+REPS <- 4L
+timeit <- function(expr) {
+ expr <- substitute(expr); env <- parent.frame()
+ eval(expr, env)
+ ts <- numeric(REPS)
+ for (i in seq_len(REPS)) { t0 <- Sys.time(); eval(expr, env)
+ ts[i] <- as.numeric(Sys.time() - t0, units = "secs") }
+ min(ts)
+}
+prep <- function(f) { f <- structure(f, class = "multiPhylo"); Preorder(RenumberTips(f, f[[1]])) }
+gen <- list(
+ rand = function(n, k) prep(lapply(seq_len(k), function(i) ape::rtree(n, br = NULL))),
+ tall = function(n, k) { labs <- paste0("t", seq_len(n))
+ prep(lapply(seq_len(k), function(i) PectinateTree(sample(labs)))) },
+ # 70% identical base + 30% random: strict consensus is still ~star (the random
+ # 30% kill the base's splits) so strict does k full passes => bound stays valid,
+ # while distinct-split count is bounded => a more realistic-leaning case.
+ concord70 = function(n, k) { base <- ape::rtree(n, br = NULL); nb <- ceiling(0.7 * k)
+ prep(c(rep(list(base), nb), lapply(seq_len(k - nb), function(i)
+ ape::rtree(n, br = NULL, tip.label = base$tip.label)))) },
+ # MODERATE conflict (realistic gene-tree / bootstrap-with-rogues): each
+ # replicate = base with ~5% of taxa (rogues) relocated to random edges. The
+ # majority IS well resolved (rogues each break only a few replicates' backbone
+ # splits) but strict stays near-star (some replicate breaks each split) => no
+ # early-exit => bound valid. Distinct-split count ~ n + O(k*rogues): moderate.
+ moderate = function(n, k) { base <- ape::rtree(n, br = NULL)
+ m <- min(25L, max(3L, n %/% 50L))
+ prep(lapply(seq_len(k), function(i) {
+ rogues <- sample(base[["tip.label"]], m)
+ t <- DropTip(base, rogues)
+ for (r in rogues) t <- AddTip(t, label = r) # default where = random edge
+ t
+ })) }
+)
+cells <- list(c(30,5000), c(50,2000), c(5000,10), c(10000,5), c(2000,500), c(1000,1000))
+
+cat(sprintf("%-6s %-5s %-6s %10s %10s %12s %s\n",
+ "shape","n","k","hashed","strict","2*strict","verdict"))
+worst_ratio <- Inf
+for (sc in c("rand","tall","concord70","moderate")) for (cell in cells) {
+ n <- cell[1]; k <- cell[2]
+ f <- gen[[sc]](n, k)
+ th <- timeit(TreeTools:::consensus_tree(f, 0.5, exact = FALSE)) # hashed majority
+ ts <- timeit(TreeTools:::consensus_tree(f, 1)) # strict (Jansson lower-bound unit)
+ bound <- 2 * ts
+ verdict <- if (bound >= th) "Jansson can't win" else "INCONCLUSIVE — implement"
+ worst_ratio <- min(worst_ratio, bound / th)
+ cat(sprintf("%-6s %-5d %-6d %10.4f %10.4f %12.4f %s (2*strict/hashed=%.2f)\n",
+ sc, n, k, th, ts, bound, verdict, bound / th))
+}
+cat(sprintf("\nWorst-case 2*strict/hashed across all cells: %.2f\n", worst_ratio))
+cat(if (worst_ratio >= 1) "==> CONCLUSIVE: Jansson cannot beat hashed in any tested regime.\n"
+ else "==> Some cell inconclusive; implement Jansson there and measure.\n")
diff --git a/dev/profiling/drivers/renumber-bench.R b/dev/profiling/drivers/renumber-bench.R
new file mode 100644
index 000000000..cef90770e
--- /dev/null
+++ b/dev/profiling/drivers/renumber-bench.R
@@ -0,0 +1,41 @@
+# Benchmark the RenumberTips unlabelled fast path (C++ renumber_tips_to) against
+# the per-tree R path it replaces, and the resulting Consensus() wrapper share.
+# Run: Rscript dev/profiling/drivers/renumber-bench.R
+suppressWarnings(suppressMessages({
+ libpath <- readLines("dev/profiling/.libpath.txt", warn = FALSE)[1]
+ library(TreeTools, lib.loc = libpath)
+}))
+set.seed(5813)
+REPS <- 6L
+tt <- function(expr) { e <- substitute(expr); env <- parent.frame()
+ eval(e, env); ts <- numeric(REPS)
+ for (i in seq_len(REPS)) { t0 <- Sys.time(); eval(e, env)
+ ts[i] <- as.numeric(Sys.time() - t0, units = "secs") }
+ min(ts) }
+
+# Per-tree R reference path (what RenumberTips.multiPhylo used to do for an
+# unlabelled forest): apply RenumberTips to each tree individually.
+r_path <- function(forest, target) lapply(forest, RenumberTips, target)
+
+cat(sprintf("%-6s %-6s %12s %12s %9s\n", "n", "k", "R per-tree", "C++ batch", "speedup"))
+for (cfg in list(c(30,5000), c(50,2000), c(100,1000), c(500,300), c(2000,100))) {
+ n <- cfg[1]; k <- cfg[2]
+ forest <- lapply(seq_len(k), function(i) ape::rtree(n, br = NULL)) # shuffled orders
+ target <- paste0("t", seq_len(n))
+ mp <- structure(forest, class = "multiPhylo")
+ tR <- tt(r_path(forest, target))
+ tC <- tt(RenumberTips(mp, target))
+ cat(sprintf("%-6d %-6d %12.4f %12.4f %8.2fx\n", n, k, tR, tC, tR / tC))
+}
+
+cat("\n== Consensus() end-to-end vs core (new wrapper share) ==\n")
+for (cfg in list(c(30,5000), c(100,100), c(2000,10))) {
+ n <- cfg[1]; k <- cfg[2]
+ forest <- lapply(seq_len(k), function(i) ape::rtree(n, br = NULL))
+ raw <- structure(forest, class = "multiPhylo")
+ pre <- Preorder(RenumberTips(raw, raw[[1]]))
+ te <- tt(Consensus(raw, p = 0.5))
+ tc <- tt(TreeTools:::consensus_tree(pre, 0.5, exact = FALSE))
+ cat(sprintf("n=%-5d k=%-5d Consensus()=%8.4f core=%8.4f wrapper=%8.4f (%.0f%%)\n",
+ n, k, te, tc, te - tc, 100 * (te - tc) / te))
+}
diff --git a/dev/profiling/drivers/tie-check.R b/dev/profiling/drivers/tie-check.R
new file mode 100644
index 000000000..f20f59768
--- /dev/null
+++ b/dev/profiling/drivers/tie-check.R
@@ -0,0 +1,67 @@
+#!/usr/bin/env Rscript
+# Settle the p = 0.5 exact-tie question empirically:
+# - what does TreeTools::Consensus drop/keep at exactly 50%?
+# - does it agree with ape::consensus at the tie boundary?
+# - does it agree with the documented "proportion p or more" semantics?
+suppressMessages(pkgload::load_all(quiet = TRUE)) # run from the package root
+
+cat("TreeTools loaded\n\n")
+
+splitsOf <- function(tr) {
+ s <- as.Splits(tr)
+ sort(vapply(as.character(s), identity, character(1)))
+}
+
+report <- function(label, trees, p) {
+ n <- length(trees)
+ trees <- RenumberTips(trees, trees[[1]])
+ tt <- Consensus(trees, p = p)
+ ttx <- Consensus(trees, p = p, hash = FALSE)
+ ap <- ape::consensus(trees, p = p)
+ thresh <- floor(n * p) + 1
+ cat(sprintf("== %s : n=%d, p=%.3f, thresh=floor(n*p)+1=%d (count >= %d to keep)\n",
+ label, n, p, thresh, thresh))
+ cat(sprintf(" TreeTools Nnode (hashed) = %d ; (exact) = %d ; ape Nnode = %d\n",
+ tt$Nnode, ttx$Nnode, ap$Nnode))
+ agreeHashExact <- isTRUE(all.equal(RootTree(tt, 1), RootTree(ttx, 1)))
+ agreeApe <- isTRUE(all.equal(RootTree(tt, 1), RootTree(ap, 1)))
+ cat(sprintf(" hashed==exact : %s ; TreeTools==ape : %s\n",
+ agreeHashExact, agreeApe))
+ if (!agreeApe) {
+ cat(" TreeTools splits:\n"); print(splitsOf(tt))
+ cat(" ape splits:\n"); print(splitsOf(ap))
+ }
+ cat("\n")
+ invisible(list(tt = tt, ape = ap, agreeApe = agreeApe))
+}
+
+# 1. The canonical even tie: 2 + 2 conflicting topologies (the test-Consensus tie fixture)
+tie <- c(rep(list(BalancedTree(8)), 2L), rep(list(PectinateTree(8)), 2L))
+report("tie 2xBal + 2xPec (8 tip)", tie, 0.5)
+
+# 2. Minimal 2-tree conflict (n = 2, each conflicting split at exactly 50%)
+two <- list(ape::read.tree(text = "((a, b), (c, d));"),
+ ape::read.tree(text = "((a, c), (b, d));"))
+report("2 conflicting quartets", two, 0.5)
+
+# 3. A controlled exact-50% on ONE clade: 4 trees, 2 share clade {t1,t2}, 2 share {t2,t3}
+# (rest identical) so exactly one pair of conflicting splits sits at 2/4.
+mk <- function(txt) ape::read.tree(text = txt)
+ctrl <- list(
+ mk("(((t1,t2),t3),t4,t5,t6);"),
+ mk("(((t1,t2),t3),t4,t5,t6);"),
+ mk("((t1,(t2,t3)),t4,t5,t6);"),
+ mk("((t1,(t2,t3)),t4,t5,t6);")
+)
+report("controlled 2-2 clade tie (6 tip)", ctrl, 0.5)
+
+# 4. Threshold boundary at p = 2/3 with n = 3 (n*p = 2 exactly): is a 2/3 split kept?
+# doc 'p or more' => count/n >= 2/3 => count >= 2 keep; impl floor(2)+1=3 => drop.
+thr <- list(
+ mk("(((t1,t2),t3),t4,t5,t6);"),
+ mk("(((t1,t2),t3),t4,t5,t6);"),
+ mk("((t1,(t2,t3)),t4,t5,t6);")
+)
+report("p=2/3, n=3, clade at exactly 2/3", thr, 2 / 3)
+
+cat("done\n")
diff --git a/dev/profiling/findings.md b/dev/profiling/findings.md
new file mode 100644
index 000000000..e603dc3be
--- /dev/null
+++ b/dev/profiling/findings.md
@@ -0,0 +1,68 @@
+# Consensus profiling findings
+
+Task-row format. P = priority (1 highest). Status: OPEN / DONE / WONTFIX.
+
+| ID | P | Status | Verified Δ | Finding | Evidence |
+|----|---|--------|-----------|---------|----------|
+| C-001 | 1 | DONE | hashed median **x1.23**; tall(10000,5) **x13.0**, tall(5000,10) x8.6, rand(10000,5) x2.5, tall(2000,500) x2.9; nsplits identical in every cell. | [Optimise] hashed: DEFER split-pattern materialisation until thresholding — keep a `(tree,L,R)` witness per distinct split, build the packed pattern only for splits reaching `thresh`. | At high n the dominant hashed cost is per-distinct-split work, most wasted (few/no splits survive on low-concordance input). Strict path (no materialisation) is 5–30× faster than hashed-majority at high n. Implemented in src/consensus.cpp (SplitWitness + materialise_witness). Gate: 590 checks PASS (hashed==exact==ape@0.5; incl. consensus content n≤3000 and SplitFrequency split+count n=2000/5000); test-consensus 8/8, test-Support 6/6, verify-consensus green. Speedup verified: compare-grids.R. |
+| C-002 | 2 | DONE | RenumberTips **1.8–9.0×** (30,5000: 0.335→0.037s 9.0×; 50,2000 6.7×; 100,1000 6.3×; 500,300 2.6×; 2000,100 1.8×). Results identical. | [Optimise] `RenumberTips()` on an unlabelled multiPhylo/list now relabels the whole forest in ONE C++ pass (`renumber_tips_to`), replacing the per-tree R loop; no-op pass-through for trees already in target order. Shared by Consensus() and all RenumberTips callers. Returns NULL → existing R fallback for numeric tipOrder / differing label sets / non-phylo elements, so edge-case behaviour is preserved exactly. | **"Tell without looking?"** Only a *labelled* multiPhylo (TipLabel attr) guarantees a shared order without inspecting each tree; otherwise inspection is unavoidable, but now in C++ (and a matching tree is passed through untouched). Gate: full suite GREEN (296); correctness-gate 590; new equivalence test (C++ == per-tree across mixed orderings). LESSON: the existing RenumberTips test caught that the C++ list initially dropped `names` (the lapply path preserves them) — fixed by copying input names onto the result. Bench: drivers/renumber-bench.R. |
+| C-004 | 2 | DONE | strip→`c(trees)`: forest201.80 3.38→2.54ms (**-25%**), forestMaj 2.60→2.10 (-19%), forest21.260 1.23→1.12 (-9%), forest1k.888 72.1→68.0 (-6%, strict core is 92%). Output byte-identical. | [Optimise] Consensus() metadata-strip replaced by bare `trees <- c(trees)`. The old `lapply(c(trees), \\(tr){tr$edge.length<-NULL; tr$node.label<-NULL; tr})` copied EVERY tree; the consensus core reads only edge+tip.label and the output is rebuilt from scratch, so neither field can change the answer. `c()` alone still materialises a labelled multiPhylo's shared labels onto each tree (the load-bearing job). **Floor finding (key):** the naive presence-guard (`if(!is.null(...))`) was a near-wash — the `is.null` reads cost ~as much as the assignments they skip (0.82→0.75ms), and REGRESSED the metadata-present case; the win comes from skipping the lapply entirely (strip 0.82→c() 0.11ms, ~7x). | gha-bench-diagnose.R: strip was 19–29% of the SMALL forests, 6% of forest1k.888. Equivalence: c004-equiv.R 108 cells (plain/unaligned/edge_len/node_lab/both/labelled/lab_el/list/diff_size × p × check × hash) byte-identical pre/post; core RawMatrix metadata-independent 36 cells; new test-Consensus.R block (edge.length/node.label/labelled, check T/F, p∈{.5,⅔,1}); full suite 4192 PASS; gate 590. |
+| C-003 | 3 | OPEN | — | [Optimise] hashed: large `unordered_map` churn dominates the LOW-height high-distinct (`rand`) case (deferred materialisation does NOT help there — clusters are tiny). Try a flat open-addressing map / better reserve. Asymptotic floor is Ω(#distinct) ops; only Jansson's O(n) candidate tree removes it. | jansson-bound.R: rand high-n strict ≪ hashed; the gap is hashmap ops, not materialisation. Realistic (concordant) inputs have few distinct splits ⇒ small map ⇒ low priority. |
+| C-005 | 3 | OPEN | — | [Optimise] `NTip(trees)` in the Consensus() repeat-loop is now the next-largest wrapper cost after the C-004 strip removal: forest201.80 0.73ms (22% of total), forest1k.888 3.4ms (5%). It builds a full k-vector of tip counts every loop iteration just to test `length(unique(.)) > 1`. Could early-exit / cache / vectorise via the materialised list. Consensus()-local, low risk. | gha-bench-diagnose.R breakdown. |
+
+## Why the GHA `bench-Consensus.R` cells show NSD after C-001/C-002 (2026-06-03)
+
+The shipped wins don't appear in the GHA suite because **none of its 5 cells
+exercise the regimes the optimisations target** (gha-bench-diagnose.R):
+
+- All four forests (`as.phylo(int, n)`) are **metadata-free, already in matching
+ tip order, and high-concordance** (consecutive tree-integers ⇒ few distinct
+ splits).
+- **C-002 (RenumberTips):** 4/5 cells pass `check = FALSE` (partial-matches
+ `check.labels = FALSE`) ⇒ RenumberTips never runs. The one cell that runs it
+ (`forest21.260`, the −6.96% blip) is k=21 and already-ordered, so the C++
+ path's fixed setup doesn't amortise vs the old R loop — real, but NSD.
+- **C-001 (deferred materialisation):** 3/5 cells use default `p = 1` (strict)
+ ⇒ the single-reference path, which never calls `count_splits_hashed`. The 2
+ majority cells (`p = 0.5`) are 1–3 ms and high-concordance ⇒ nothing to defer.
+- The strict C++ core dominates the one big cell (`forest1k.888`: 66 of 72 ms).
+
+To make GHA *demonstrate* the wins, add cells that hit the regimes (OFFERED to
+user, not yet added): a high-k unlabelled forest with **shuffled** tip order
+(real relabel → C-002), and a tall **low-concordance `p < 1`** forest (→ C-001).
+C-004 does move the small cells (strip was 19–29% of them).
+
+## Threshold semantics at `p > 0.5` — open question for maintainer (2026-06-03)
+
+Prompted by "how are p = 0.5 ties resolved?". Two distinct findings (repro:
+`drivers/tie-check.R`):
+
+- **p = 0.5 ties are handled correctly.** `thresh = floor(n·p) + 1`, so a split
+ in exactly n/2 trees (`count = n/2 < thresh`) is **dropped**. Two conflicting
+ 50% splits are both dropped ⇒ valid tree. Provably correct (any two retained
+ splits each occur in > n/2 trees ⇒ counts sum > n ⇒ co-occur ⇒ compatible) and
+ **matches `ape::consensus`** (which bumps `p <- 0.5000001` to the same effect).
+ Now pinned by `ApeTest(tie, 0.5)` in test-Consensus.R (the even-n tie fixture
+ was previously only in the hashed==exact test — internal counters only, no
+ oracle).
+
+- **p > 0.5 was off-by-one *stricter* than the docstring and `ape` — FIXED.**
+ The docstring promises "a proportion `p` or more"; `ape` keeps `count >= p·n`.
+ The old code kept `count >= floor(n·p) + 1`, diverging when `n·p` is an exact
+ integer k (ape keeps `count = k`; old code required `count >= k+1`; e.g. n=3,
+ p=2/3 dropped a clade in 2 of 3 trees). **Maintainer chose: align code to
+ "p or more".** consensus.cpp now uses `thresh = max(floor(n/2)+1, ceil(p·n))`
+ with a relative tolerance (`1e-9·pn`) snapping `p·n` to an exact integer
+ through FP noise — robust where ape's raw `>=` is fragile. p = 0.5 unchanged
+ (strict majority, via the `floor(n/2)+1` floor). Pinned by `ApeTest(thirds,
+ 2/3)` (n=3 boundary). Docstring + NEWS updated. Gate 590 PASS / 0 ape
+ disagreements; full suite 4195 PASS.
+
+## Jansson decision (tracking)
+
+Open question: does deterministic Jansson (Maj_Rule_Plus, O(kn)) beat OPTIMISED
+hashed in any *realistic* regime? Realistic consensus inputs are concordant (few
+distinct splits) where hashed is O(kn) tiny-constant and wins. Jansson's only
+edge is high-n LOW-concordance (≈ independent random trees — not a real consensus
+input). Lower bound: Jansson ≥ 2×strict. Decision pending grid-v2 + a realistic
+high-concordance scenario. See log.md / PLAN-consensus.md.
diff --git a/dev/profiling/focus-areas.md b/dev/profiling/focus-areas.md
new file mode 100644
index 000000000..efdb6c61c
--- /dev/null
+++ b/dev/profiling/focus-areas.md
@@ -0,0 +1,16 @@
+# Profiling focus areas — Consensus
+
+Ranked by (per-call cost) × (call frequency on the consensus hot path).
+See PLAN-consensus.md for the broader project.
+
+| # | Area | Files | Why hot | Baseline cost | Last profiled | Status |
+|---|------|-------|---------|---------------|---------------|--------|
+| 1 | `count_splits_hashed` (majority default) | src/consensus.cpp:267 | One postorder pass/tree; the default majority counter. O(kn). | tbd | — | NEW |
+| 2 | `count_splits_exact` (deterministic) | src/consensus.cpp:86 | Builds packed bitkey by L..R loop/cluster ⇒ O(k·n·h); slow for tall trees. | tbd | — | NEW |
+| 3 | `ClusterTable` constructor | inst/include/TreeTools/ClusterTable.h:352 | Built once per tree per call ⇒ k constructions; `root_on_node` + allocs. | tbd | — | NEW |
+| 4 | `Consensus()` R wrapper | R/Consensus.R:46 | Per-call R overhead: metadata strip (lapply), NTip, RenumberTips, Preorder, splits_to_edge, RootTree. May dominate at low-n/high-k. | tbd | — | NEW |
+| 5 | strict single-ref path | src/consensus.cpp:165 | p=1 path; already O(kn) optimal. Profile only to confirm AT-LIMIT. | tbd | — | NEW |
+| 6 | `for_each_internal_node` primitive | src/consensus.cpp:44 | Shared postorder/stack arithmetic backing #1,#2,#5. | tbd | — | NEW |
+| 7 | Jansson core (to be written) | src/consensus.cpp (new) | Deterministic O(kn) candidate-tree path; compare vs #1,#2. | tbd | — | NEW |
+
+Parked (called once/session; not in rotation): IO, label checking.
diff --git a/dev/profiling/log.md b/dev/profiling/log.md
new file mode 100644
index 000000000..6e46def4d
--- /dev/null
+++ b/dev/profiling/log.md
@@ -0,0 +1,88 @@
+# Profiling log — Consensus
+
+last_focus: 1
+
+---
+
+## Round 1 — deferred materialisation + Jansson decision (2026-06-02)
+
+**Optimisation C-001 (DONE, gated, measured).** Hashed counter now defers
+split-pattern materialisation: each distinct split keeps a `(tree,L,R)` witness;
+only splits reaching `thresh` are materialised (consensus), or all (split_freq).
+Gate: 576 checks PASS. Measured (compare-grids.R): hashed median x1.23, with the
+wins where it was weakest — **tall(10000,5) x13.0**, tall(5000,10) x8.6,
+rand(10000,5) x2.5, tall/rand high-k/high-n x1.5–2.9. nsplits identical in every
+cell (no behaviour change). exact untouched (x1.02).
+
+**Jansson decision (bound re-run vs OPTIMISED hashed, + concord70).**
+- Concordant (realistic) high-k/high-n: 2*strict ≥ hashed (x1.24–1.32) ⇒
+ **Jansson can't win**. Same for all high-k/low-n.
+- **Moderate conflict** (realistic gene-tree / bootstrap-with-rogues: base + ~5%
+ rogue taxa relocated/replicate) at high-k/high-n: 2*strict ≥ hashed
+ (2000,500: 1.54; 1000,1000: 1.42) ⇒ **Jansson can't win**. low-k/high-n
+ (5000,10): 1.09 ⇒ loses.
+- Only NOT-proven cells: (10000,5) (any concordance) — but <60ms absolute, a 2×
+ gap is irrelevant; and high-k/high-n EXTREME conflict (rand/tall) — degenerate
+ near-star consensus, not a real input.
+- **DECISION (advisor-confirmed): do NOT implement full Jansson.** Proven to lose
+ (2*strict ≥ hashed) in every realistic, time-meaningful regime. Rationale:
+ user's "avoid redundant code"; high correctness risk (FACT ships broken
+ majority); collision risk waived. Honest scope: this is a rigorous *lower-bound*
+ result where it concludes — not a direct Jansson measurement; the un-proven
+ cells are negligible-time or degenerate. Re-openable (algorithm + subroutine
+ specs in PLAN-consensus.md / DECISION.md). See **DECISION.md** (user-facing).
+
+**Advisor passes (2):** (1) flagged compare-grids' nsplits check is COUNT not
+CONTENT ⇒ added high-n (n=1000/3000) hashed==exact split-SET content cells;
+confirmed verify-consensus.R green; added moderate-conflict generator. (2) flagged
+the symmetric gap — the witness change also rewired SplitFrequency's hashed
+assembly, only checked at n≤30 ⇒ added high-n (n=2000/5000) SplitFrequency
+hashed==exact split-SET+COUNT cells. Gate now 590 checks PASS. Every modified
+code path gated at shipping scale (+ test-consensus 8/8, test-Support 6/6).
+Advisor also corrected its earlier "commit" → do NOT commit (on main, not asked).
+
+**Next (if continuing):** C-002 wrapper fast-path (high-k/low-n, 57% overhead) —
+the remaining real throughput win, in a target regime. Shared-code (RenumberTips)
+risk ⇒ gate with full test suite.
+
+---
+
+## Round 0 — scaffold + baseline + Jansson lower bound (2026-06-02)
+
+**Setup.** Installed -O2 -g build into scratch lib (`.libpath.txt`); all drivers
+load from it (not load_all). Built the correctness gate
+(`correctness-gate.R`, 576 checks: hashed==exact everywhere; ape agrees at p=0.5)
+and the benchmark grid (`drivers/consensus-grid.R` → `results-grid.csv`,
+summarised in `baselines.md`).
+
+**Correctness fixes made en route (real bugs):**
+- gate `which.min()` → `integer(0)` silently skipped EVERY ape check;
+ fixed to tip-index 1. Hard hashed==exact gate was still valid (both used the
+ same broken-but-consistent canonicaliser).
+- `dev/red-team/verify-consensus.R` called `consensus_tree(forest, p, hash=FALSE)`
+ but the export arg is `exact` → "unused argument" → that adversarial gate had
+ been DEAD since the rename. Fixed to `exact = TRUE` (lines 49, 77).
+- Threshold-convention finding (surfaced to MRS, not changed): TreeTools keeps
+ `count > k*p`; ape keeps `>= k*p` for p>0.5; roxygen says "p or more" (>=) but
+ code is strict (>). Out of scope here.
+
+**Baseline result.** hashed beats exact in all 80 cells (1.1x–3x). R-wrapper is
+57% of end-to-end at high-k/low-n (30,5000). See `baselines.md`.
+
+**Jansson lower bound** (`drivers/jansson-bound.R`; advisor-validated: Jansson ≥
+2×strict-path). Verdict per cell:
+- low-n/high-k: 2*strict ≥ hashed (ratio 1.1–1.4) ⇒ **Jansson can't win** there.
+- HIGH-n (all shapes): 2*strict ≪ hashed (ratio 0.07–0.51) ⇒ **INCONCLUSIVE**.
+ Strict is 5–30× faster than hashed-majority at high-n because hashed handles
+ ~k·n DISTINCT splits (big hash map + per-split materialisation) while strict
+ matches against ONE reference's O(n) clusters. Jansson's candidate tree is also
+ O(n) clusters ⇒ it plausibly shares strict's advantage and could BEAT hashed at
+ high-n. **Must implement Jansson to settle this** (user asked; advisor concurs).
+
+**Caveat noticed:** cost structure is subtler than "materialisation dominates"
+(identical is *slower* than rand at (10000,5) despite fewer distinct splits) ⇒
+must VTune the core, not guess, before optimising it.
+
+**Next:** (R1) profvis the R wrapper at high-k/low-n → fast-path. (R2) VTune the
+hashed/exact core at high-n to find the true hotspot. (R3) implement Jansson
+(R-prototype-first, gated) and benchmark vs optimised hashed.
diff --git a/dev/red-team/verify-consensus.R b/dev/red-team/verify-consensus.R
index 76ea701ef..265bc3f47 100644
--- a/dev/red-team/verify-consensus.R
+++ b/dev/red-team/verify-consensus.R
@@ -85,8 +85,8 @@ for (n_tip in c(4, 6, 8, 13, 30)) {
for (k in c(1, 2, 5, 12, 40)) {
forest <- lapply(seq_len(k), function(i) ape::rtree(n_tip, br = NULL))
class(forest) <- "multiPhylo"
- sh <- SplitFrequency(forest, exact = FALSE)
- se <- SplitFrequency(forest, exact = TRUE)
+ se <- SplitFrequency(forest, hash = FALSE)
+ sh <- SplitFrequency(forest, hash = TRUE)
# Compare as (split -> count) maps, order-independent
key <- function(s) {
if (length(s) == 0) return(character(0))
diff --git a/man/Consensus.Rd b/man/Consensus.Rd
index 560ec9c71..911129ab7 100644
--- a/man/Consensus.Rd
+++ b/man/Consensus.Rd
@@ -9,9 +9,10 @@ Consensus(trees, p = 1, check.labels = TRUE, hash = TRUE)
\arguments{
\item{trees}{List of trees, optionally of class \code{multiPhylo}.}
-\item{p}{Proportion of trees that must contain a split for it to be reported
-in the consensus. \code{p = 0.5} gives the majority-rule consensus; \code{p = 1} (the
-default) gives the strict consensus.}
+\item{p}{A number from 0.5 to 1 giving the proportion of trees that must
+contain a split for it to be reported in the consensus: from \code{p = 0.5} (more
+than half the trees; the majority-rule consensus) to \code{p = 1} (every tree; the
+strict consensus, the default).}
\item{check.labels}{Logical specifying whether to check that all trees have
identical labels. Defaults to \code{TRUE}, which is slower.}
@@ -35,10 +36,13 @@ The strict consensus (\code{p = 1}) compares the clusters of the first tree
against every other tree in linear time. The majority-rule and threshold
consensus (\verb{0.5 <= p < 1}) instead count the frequency of every split across
all trees in a single pass and retain those occurring in a proportion \code{p} or
-more of trees; this runs in time linear in the number of trees, after
-\insertCite{Jansson2016}{TreeTools}. By default the count uses
-a 128-bit hash, whose results are exact with overwhelming probability; set
-\code{hash = FALSE} for a slower but guaranteed-exact count.
+more of trees (i.e. in at least \code{ceiling(p * length(trees))} trees); this
+runs in time linear in the number of trees, after
+\insertCite{Jansson2016}{TreeTools}. The majority threshold \code{p = 0.5} is
+strict: a split is retained only if it occurs in \emph{more} than half the trees,
+so that two conflicting splits can never both be reported. By default the
+count uses a 128-bit hash, whose results are exact with overwhelming
+probability; set \code{hash = FALSE} for a slower but guaranteed-exact count.
}
\examples{
Consensus(as.phylo(0:2, 8))
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 36c2de209..4a9110e0d 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -326,6 +326,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
+// renumber_tips_to
+SEXP renumber_tips_to(Rcpp::List trees, const Rcpp::CharacterVector target);
+RcppExport SEXP _TreeTools_renumber_tips_to(SEXP treesSEXP, SEXP targetSEXP) {
+BEGIN_RCPP
+ Rcpp::RObject rcpp_result_gen;
+ Rcpp::RNGScope rcpp_rngScope_gen;
+ Rcpp::traits::input_parameter< Rcpp::List >::type trees(treesSEXP);
+ Rcpp::traits::input_parameter< const Rcpp::CharacterVector >::type target(targetSEXP);
+ rcpp_result_gen = Rcpp::wrap(renumber_tips_to(trees, target));
+ return rcpp_result_gen;
+END_RCPP
+}
// cpp_edge_to_splits
Rcpp::RawMatrix cpp_edge_to_splits(const Rcpp::IntegerMatrix& edge, const Rcpp::IntegerVector& order, const Rcpp::IntegerVector& nTip);
RcppExport SEXP _TreeTools_cpp_edge_to_splits(SEXP edgeSEXP, SEXP orderSEXP, SEXP nTipSEXP) {
@@ -562,6 +574,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_TreeTools_node_depth_unrooted", (DL_FUNC) &_TreeTools_node_depth_unrooted, 4},
{"_TreeTools_path_lengths", (DL_FUNC) &_TreeTools_path_lengths, 3},
{"_TreeTools_renumber_tips_batch", (DL_FUNC) &_TreeTools_renumber_tips_batch, 4},
+ {"_TreeTools_renumber_tips_to", (DL_FUNC) &_TreeTools_renumber_tips_to, 2},
{"_TreeTools_cpp_edge_to_splits", (DL_FUNC) &_TreeTools_cpp_edge_to_splits, 3},
{"_TreeTools_duplicated_splits", (DL_FUNC) &_TreeTools_duplicated_splits, 2},
{"_TreeTools_mask_splits", (DL_FUNC) &_TreeTools_mask_splits, 1},
diff --git a/src/consensus.cpp b/src/consensus.cpp
index a8bc892e7..61617af40 100644
--- a/src/consensus.cpp
+++ b/src/consensus.cpp
@@ -8,6 +8,7 @@ using namespace Rcpp;
#include /* for fill */
#include /* for array */
#include /* for steady_clock (interrupt timing) */
+#include /* for ceil, round, fabs (threshold) */
#include /* for uint64_t (split hashing) */
#include /* for string (hash key) */
#include /* for unordered_map */
@@ -19,6 +20,26 @@ using TreeTools::ClusterTable;
struct StackEntry { int32 L, R, N, W; };
+// A distinct split's "witness": tree index t and the [L, R] encode-range that
+// the split spans in THAT tree's own ClusterTable encoding (where every cluster
+// is contiguous). The split's leaf set is reconstructed on demand as
+// {tables[t].DECODE(j) : L <= j <= R}. Lets the hashed counter defer building
+// the packed bit pattern until thresholding, so non-surviving splits (the bulk,
+// on low-concordance input) are never materialised.
+struct SplitWitness { int32 t, L, R; };
+
+// Reconstruct a split's packed bit pattern from its witness into row `dst`
+// (nbin bytes, pre-zeroed by caller). tables is non-const (DECODE mutates none
+// of the observable state but is not const-qualified).
+inline void materialise_witness(std::vector& tables,
+ const SplitWitness& w, Rbyte* const dst) {
+ ClusterTable& tab = tables[w.t];
+ for (int32 j = w.L; j <= w.R; ++j) {
+ const int32 leaf_idx = tab.DECODE(j) - 1; // 0-based original leaf id
+ dst[leaf_idx >> 3] |= static_cast(1 << (leaf_idx & 7));
+ }
+}
+
// Throttled (~1 s) user-interrupt check.
inline void throttled_interrupt(
std::chrono::steady_clock::time_point& last) {
@@ -123,8 +144,7 @@ void count_splits_exact(std::vector& tables, const int32 n_tip,
// for the majority/threshold consensus. Non-dependent name in the template
// below, so it must be declared first.
void count_splits_hashed(std::vector& tables, const int32 n_tip,
- const int32 nbin,
- std::vector>& split_patterns,
+ std::vector& witnesses,
std::vector& counts);
// ---------------------------------------------------------------------------
@@ -145,8 +165,23 @@ RawMatrix calc_consensus_tree(
StackContainer& S
) {
const int32 n_trees = trees.length();
- const int32 frac_thresh = int32(n_trees * p[0]) + 1;
- const int32 thresh = frac_thresh > n_trees ? n_trees : frac_thresh;
+ // Retain a split iff it occurs in a proportion `p` or more of trees, i.e.
+ // count >= ceil(p * n_trees) -- the documented semantics, matching
+ // ape::consensus(). A small relative tolerance snaps p * n_trees back to an
+ // exact integer when floating-point rounding nudges it (e.g. 3 * (2/3) is not
+ // exactly 2), so a split in exactly p * n trees is kept, not dropped by an
+ // off-by-one. The majority threshold p = 0.5 is strict (count > n_trees / 2):
+ // two conflicting splits each in exactly half the trees must not both survive,
+ // else the retained splits would not be pairwise compatible.
+ const double pn = double(n_trees) * p[0];
+ const double pn_round = std::round(pn);
+ const int32 ceil_pn =
+ (std::fabs(pn - pn_round) < 1e-9 * (pn > 1.0 ? pn : 1.0))
+ ? int32(pn_round)
+ : int32(std::ceil(pn));
+ const int32 strict_floor = n_trees / 2 + 1; // floor(n / 2) + 1
+ const int32 want = ceil_pn > strict_floor ? ceil_pn : strict_floor;
+ const int32 thresh = want > n_trees ? n_trees : want;
std::vector tables;
tables.reserve(n_trees);
@@ -209,16 +244,11 @@ RawMatrix calc_consensus_tree(
if (splits_found == ntip_3) return ret;
}
}
- } else {
- // ---- Majority / threshold: count then threshold -----------------------
+ } else if (exact) {
+ // ---- Majority / threshold, deterministic: count then threshold --------
std::vector> split_patterns;
std::vector counts;
- if (exact) {
- count_splits_exact(tables, n_tip, nbin, S, split_patterns, counts);
- } else {
- count_splits_hashed(tables, n_tip, nbin, split_patterns, counts);
- }
-
+ count_splits_exact(tables, n_tip, nbin, S, split_patterns, counts);
const int32 n_distinct = int32(split_patterns.size());
for (int32 i = 0; i < n_distinct; ++i) {
if (counts[i] >= thresh) {
@@ -229,6 +259,25 @@ RawMatrix calc_consensus_tree(
if (splits_found == ntip_3) return ret;
}
}
+ } else {
+ // ---- Majority / threshold, hashed: count, then materialise ONLY the
+ // splits that reach the threshold (the rest are never built). --------
+ std::vector witnesses;
+ std::vector counts;
+ count_splits_hashed(tables, n_tip, witnesses, counts);
+ const int32 n_distinct = int32(witnesses.size());
+ std::vector row(nbin);
+ for (int32 i = 0; i < n_distinct; ++i) {
+ if (counts[i] >= thresh) {
+ std::fill(row.begin(), row.end(), Rbyte(0));
+ materialise_witness(tables, witnesses[i], row.data());
+ for (int32 c = 0; c < nbin; ++c) {
+ ret(splits_found, c) = row[c];
+ }
+ ++splits_found;
+ if (splits_found == ntip_3) return ret;
+ }
+ }
}
return (splits_found == 0) ? RawMatrix(0, nbin) :
@@ -241,8 +290,10 @@ RawMatrix calc_consensus_tree(
// A single O(kn) pass: each non-trivial cluster is identified by a 128-bit
// subtree hash = the (order-independent) sum of its leaves' fixed splitmix64
// hashes, so the same split in different trees hashes identically without an
-// O(cluster size) key. Counts accumulate directly; the bit pattern is
-// materialised only the first time a split is seen. Exactness is therefore
+// O(cluster size) key. Counts accumulate directly; materialisation of the
+// packed bit pattern is DEFERRED — each distinct split keeps only a witness
+// (tree + encode-range), so the bulk of splits that never reach the consensus
+// threshold are never materialised (the dominant cost at high n). Exactness is
// probabilistic (a 128-bit collision, ~1e-30, would conflate two splits).
inline uint64_t splitmix64(uint64_t x) {
x += 0x9e3779b97f4a7c15ULL;
@@ -265,8 +316,7 @@ struct Hash128Hasher {
};
void count_splits_hashed(std::vector& tables, const int32 n_tip,
- const int32 nbin,
- std::vector>& split_patterns,
+ std::vector& witnesses,
std::vector& counts) {
const int32 n_trees = int32(tables.size());
const int32 ntip_3 = n_tip - 3;
@@ -324,13 +374,10 @@ void count_splits_hashed(std::vector& tables, const int32 n_tip,
const Hash128 hkey{hlo, hhi};
auto it = split_map.find(hkey);
if (it == split_map.end()) {
- split_map.emplace(hkey, int32(split_patterns.size()));
- std::vector pattern(nbin, 0);
- for (int32 j = L; j <= R; ++j) {
- const int32 leaf_idx = tree.DECODE(j) - 1;
- pattern[leaf_idx >> 3] |= (Rbyte(1) << (leaf_idx & 7));
- }
- split_patterns.push_back(std::move(pattern));
+ split_map.emplace(hkey, int32(witnesses.size()));
+ // Defer materialisation: record where this split lives (tree t,
+ // encode-range [L, R]) and rebuild the pattern later only if needed.
+ witnesses.push_back({t, L, R});
counts.push_back(1);
} else {
++counts[it->second];
@@ -356,6 +403,27 @@ inline List frequencies_list(
return List::create(Named("splits") = ret, Named("counts") = count_vec);
}
+// As frequencies_list, but materialising each split's pattern from its witness
+// (split_frequencies needs every distinct split, so all are built — same total
+// work as eager, just deferred, with less peak memory: no vector-of-patterns).
+inline List frequencies_list_from_witnesses(
+ std::vector& tables,
+ const std::vector& witnesses,
+ const std::vector& counts, const int32 nbin) {
+ const int32 splits_found = int32(witnesses.size());
+ RawMatrix ret(splits_found, nbin);
+ std::vector row(nbin);
+ for (int32 r = 0; r < splits_found; ++r) {
+ std::fill(row.begin(), row.end(), Rbyte(0));
+ materialise_witness(tables, witnesses[r], row.data());
+ for (int32 c = 0; c < nbin; ++c) {
+ ret(r, c) = row[c];
+ }
+ }
+ IntegerVector count_vec(counts.begin(), counts.end());
+ return List::create(Named("splits") = ret, Named("counts") = count_vec);
+}
+
List calc_split_frequencies_hashed(const List& trees, const int32 n_tip) {
const int32 n_trees = trees.length();
std::vector tables;
@@ -364,10 +432,10 @@ List calc_split_frequencies_hashed(const List& trees, const int32 n_tip) {
tables.emplace_back(ClusterTable(Rcpp::List(trees(i))));
}
const int32 nbin = (n_tip + 7) / 8;
- std::vector> split_patterns;
+ std::vector witnesses;
std::vector counts;
- count_splits_hashed(tables, n_tip, nbin, split_patterns, counts);
- return frequencies_list(split_patterns, counts, nbin);
+ count_splits_hashed(tables, n_tip, witnesses, counts);
+ return frequencies_list_from_witnesses(tables, witnesses, counts, nbin);
}
// Exact split frequencies: the same single-pass count, returned in full.
diff --git a/src/renumber_tips.cpp b/src/renumber_tips.cpp
index 437fd7d70..3573ff7f2 100644
--- a/src/renumber_tips.cpp
+++ b/src/renumber_tips.cpp
@@ -1,6 +1,11 @@
#include
using namespace Rcpp;
+#include /* for fill */
+#include /* for string (label keys) */
+#include /* for unordered_map */
+#include /* for vector */
+
// Apply a precomputed tip permutation to every tree in batch.
// Returns a list of modified phylo objects (cloned, with new
// edge matrices, updated tip.label, and "preorder" downgraded to "cladewise").
@@ -57,3 +62,96 @@ Rcpp::List renumber_tips_batch(
return result;
}
+
+// Renumber every tree in an UNLABELLED forest so its tips follow `target`'s
+// order, entirely in C++. Unlike renumber_tips_batch (one shared permutation
+// for a labelled multiPhylo), each tree may carry its own tip.label order, so a
+// per-tree permutation is derived from a single hash of `target`. This is the
+// fast path for RenumberTips on a plain list / unlabelled multiPhylo, replacing
+// a per-tree R loop.
+//
+// Returns the relabelled list, OR R_NilValue to tell the R caller to fall back
+// to the per-tree R path (RenumberTips.phylo). NULL is returned for anything
+// this fast path does not handle identically to RenumberTips.phylo: an element
+// that is not a phylo-like list or lacks `edge`/`tip.label`; a tree whose label
+// set differs from `target` (different length, an unknown or NA label, or a
+// non-bijection / duplicate); or duplicate/NA labels in `target` itself. The
+// common case — every tree is a permutation of `target` — is handled here.
+//
+// A tree already in `target` order is returned unchanged (no clone), matching
+// RenumberTips.phylo's no-op return.
+// [[Rcpp::export]]
+SEXP renumber_tips_to(Rcpp::List trees, const Rcpp::CharacterVector target) {
+ const int n_tip = target.size();
+ const int n_trees = trees.size();
+ if (n_tip == 0 || n_trees == 0) return R_NilValue;
+
+ // target label -> 1-based new tip index
+ std::unordered_map pos;
+ pos.reserve(n_tip * 2);
+ for (int i = 0; i < n_tip; ++i) {
+ SEXP s = STRING_ELT(target, i);
+ if (s == NA_STRING) return R_NilValue;
+ pos.emplace(std::string(CHAR(s)), i + 1);
+ }
+ if (static_cast(pos.size()) != n_tip) return R_NilValue; // dup target labels
+
+ Rcpp::List result(n_trees);
+ std::vector perm(n_tip + 1);
+ std::vector used(n_tip + 1);
+
+ for (int t = 0; t < n_trees; ++t) {
+ SEXP orig = trees[t];
+ if (!Rf_isVectorList(orig)) return R_NilValue;
+ Rcpp::List tree_i(orig);
+ if (!tree_i.containsElementNamed("tip.label") ||
+ !tree_i.containsElementNamed("edge")) return R_NilValue;
+
+ SEXP lab = tree_i["tip.label"];
+ if (TYPEOF(lab) != STRSXP || Rf_length(lab) != n_tip) return R_NilValue;
+
+ std::fill(used.begin(), used.end(), char(0));
+ bool already = true;
+ for (int i = 0; i < n_tip; ++i) {
+ SEXP s = STRING_ELT(lab, i);
+ if (s == NA_STRING) return R_NilValue;
+ auto it = pos.find(std::string(CHAR(s)));
+ if (it == pos.end()) return R_NilValue; // label not in target
+ const int new_idx = it->second;
+ if (used[new_idx]) return R_NilValue; // non-bijection (dup label)
+ used[new_idx] = 1;
+ perm[i + 1] = new_idx;
+ if (new_idx != i + 1) already = false;
+ }
+
+ if (already) { // no relabel needed
+ result[t] = orig;
+ continue;
+ }
+
+ SEXP orig_class = Rf_getAttrib(orig, R_ClassSymbol);
+ Rcpp::List tree_c = Rcpp::clone(tree_i);
+ Rcpp::IntegerMatrix edge =
+ Rcpp::clone(Rcpp::as(tree_c["edge"]));
+ const int n_edge = edge.nrow();
+ for (int j = 0; j < n_edge; ++j) {
+ int& child = edge(j, 1);
+ if (child <= n_tip) child = perm[child];
+ }
+ tree_c["edge"] = edge;
+ tree_c["tip.label"] = target;
+ if (tree_c.hasAttribute("order")) {
+ Rcpp::CharacterVector ord = tree_c.attr("order");
+ if (ord.size() && ord[0] == "preorder") {
+ tree_c.attr("order") = Rcpp::CharacterVector::create("cladewise");
+ }
+ }
+ if (orig_class != R_NilValue) {
+ Rf_setAttrib(Rcpp::wrap(tree_c), R_ClassSymbol, orig_class);
+ }
+ result[t] = tree_c;
+ }
+ // Preserve list names (the R lapply path this replaces keeps them).
+ result.attr("names") = trees.attr("names");
+ return result;
+}
diff --git a/tests/testthat/test-consensus.R b/tests/testthat/test-consensus.R
index a939d90d3..807bfda21 100644
--- a/tests/testthat/test-consensus.R
+++ b/tests/testthat/test-consensus.R
@@ -17,12 +17,17 @@ test_that("Consensus() errors", {
expect_equal(Consensus(c(halfTree)), halfTree)
expect_equal(Consensus(list(halfTree)), halfTree)
+ # Building 100k-leaf trees and a 33k-tip consensus dominates valgrind runtime
+ # (~30x slowdown) but exercises no memory path the smaller cases above miss;
+ # full scale still runs in uninstrumented CI.
+ skip_if(Sys.getenv("TESTING_MEMCHECK") != "")
+
# Test that trees larger than heap limit fail gracefully
expect_error(
Consensus(c(BalancedTree(100001), PectinateTree(100001))),
"too many leaves.*100000"
)
-
+
largeTree <- BalancedTree(33333)
consensus_large <- Consensus(c(largeTree, largeTree))
expect_equal(NTip(consensus_large), 33333)
@@ -47,6 +52,24 @@ test_that("Consensus()", {
ApeTest(trees, 0.5)
expect_equal(Consensus(trees, 0.999), Consensus(trees, 1))
+ # Even-n exact tie: two conflicting splits each in exactly 50% of trees.
+ # Strict majority keeps a split only when count > n / 2 (thresh =
+ # floor(n / 2) + 1), so both conflicting splits are dropped and the consensus
+ # is unresolved there -- always a valid tree. The "exact == hashed" test
+ # below exercises this fixture too, but only against the other internal
+ # counter; pin it to an independent oracle (ape) here.
+ tie <- c(rep(list(BalancedTree(8)), 2L), rep(list(PectinateTree(8)), 2L))
+ tie <- RenumberTips(tie, tie[[1]])
+ ApeTest(tie, 0.5)
+
+ # p > 0.5 boundary: a split in exactly p * n trees is retained ("p or more"),
+ # matching ape. Clade {t1, t2} occurs in 2 of 3 trees (= 2/3 exactly).
+ thirds <- lapply(c("(((t1, t2), t3), t4, t5, t6);",
+ "(((t1, t2), t3), t4, t5, t6);",
+ "((t1, (t2, t3)), t4, t5, t6);"),
+ function(x) ape::read.tree(text = x))
+ ApeTest(thirds, 2 / 3)
+
ApeTest(as.phylo(0:2, 8))
ApeTest(as.phylo(0:250, 8))
ApeTest(as.phylo(0:250, 80))
@@ -56,7 +79,44 @@ test_that("Consensus()", {
expect_equal(Consensus(trees), Preorder(StarTree(letters[1:4])))
})
+test_that("Consensus() ignores edge lengths, node labels, and TipLabel", {
+ base <- list(BalancedTree(8), PectinateTree(8))[c(1, 1, 1, 1, 2, 2, 2)]
+ base <- structure(RenumberTips(base, base[[1]]), class = "multiPhylo")
+ withEL <- structure(lapply(base, function(t) {
+ t[["edge.length"]] <- seq_len(nrow(t[["edge"]])); t
+ }), class = "multiPhylo")
+ withNL <- structure(lapply(base, function(t) {
+ t[["node.label"]] <- paste0("n", seq_len(t[["Nnode"]])); t
+ }), class = "multiPhylo")
+ labelled <- ape::.compressTipLabel(base)
+ expect_true(is.null(.subset2(labelled, 1)[["tip.label"]])) # genuinely labelled
+
+ for (p in c(0.5, 2 / 3, 1)) for (check in c(TRUE, FALSE)) {
+ ref <- Consensus(base, p, check.labels = check)
+ expect_equal(Consensus(withEL, p, check.labels = check), ref)
+ expect_equal(Consensus(withNL, p, check.labels = check), ref)
+ expect_equal(Consensus(labelled, p, check.labels = check), ref)
+ }
+ # Result carries no inherited metadata.
+ out <- Consensus(withEL, 0.5)
+ expect_null(out[["edge.length"]])
+ expect_null(out[["node.label"]])
+
+ # Differing tree sizes exercise the KeepTip repeat-loop; metadata must not
+ # change its (warned) result either.
+ mixed <- list(BalancedTree(8), PectinateTree(8), DropTip(BalancedTree(8), 1:2))
+ mixedEL <- lapply(mixed, function(t) {
+ t[["edge.length"]] <- seq_len(nrow(t[["edge"]])); t
+ })
+ expect_equal(suppressWarnings(Consensus(mixedEL, 0.5)),
+ suppressWarnings(Consensus(mixed, 0.5)))
+})
+
test_that("Consensus() handles large sets of trees", {
+ # 33k+ trees (past int16_max) is prohibitively slow under valgrind for no
+ # extra memory coverage: the hashed counter is exercised by the smaller
+ # forests above. Runs at full scale in uninstrumented CI.
+ skip_if(Sys.getenv("TESTING_MEMCHECK") != "")
oneTree <- as.phylo(0, 13)
expect_equal(
Consensus(lapply(1:33000, function(x) oneTree)),
diff --git a/tests/testthat/test-tree_numbering.R b/tests/testthat/test-tree_numbering.R
index 22d75bfe8..9dfa51b03 100644
--- a/tests/testthat/test-tree_numbering.R
+++ b/tests/testthat/test-tree_numbering.R
@@ -177,7 +177,7 @@ test_that("RenumberTips.multiPhylo() covers edge cases", {
result <- RenumberTips(mp8, 8:1)
expect_equal(TipLabels(result[[1]]), paste0("t", 8:1))
- # No-op when order already matches (unlabelled: per-tree fallback)
+ # No-op when order already matches (unlabelled: C++ pass-through, unchanged)
result_noop <- RenumberTips(mp8, TipLabels(mp8[[1]]))
expect_equal(result_noop[[1]][["edge"]], mp8[[1]][["edge"]])
expect_equal(result_noop[[2]][["edge"]], mp8[[2]][["edge"]])
@@ -201,6 +201,29 @@ test_that("RenumberTips.multiPhylo() covers edge cases", {
)
})
+test_that("RenumberTips() unlabelled C++ path matches the per-tree path", {
+ # The unlabelled multiPhylo / list fast path (renumber_tips_to in C++) must
+ # produce results IDENTICAL to applying RenumberTips.phylo per tree, across
+ # mixed tip orderings (incl. trees already in target order -> pass-through).
+ set.seed(91)
+ canon <- paste0("t", 1:12)
+ forest <- c(
+ list(BalancedTree(canon), PectinateTree(canon)), # already in order
+ lapply(1:6, function(i) ape::rtree(12, br = NULL)) # random orders
+ )
+ for (target in list(canon, rev(canon))) {
+ ref <- lapply(forest, function(tr) RenumberTips(tr, target)) # per-tree R path
+ mp <- RenumberTips(structure(forest, class = "multiPhylo"), target)
+ ls <- RenumberTips(forest, target) # list method
+ for (i in seq_along(forest)) {
+ expect_identical(mp[[i]][["edge"]], ref[[i]][["edge"]])
+ expect_identical(mp[[i]][["tip.label"]], ref[[i]][["tip.label"]])
+ expect_identical(attr(mp[[i]], "order"), attr(ref[[i]], "order"))
+ expect_identical(ls[[i]][["edge"]], ref[[i]][["edge"]])
+ }
+ }
+})
+
test_that("RenumberTips.multiPhylo() materializes tip.label", {
# TipLabel-labelled multiPhylo: individual trees should gain tip.label
# even when the order already matches (needed by RoguePlot and others